/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "impl_fp32_fp32.hpp"

namespace depthwise
{

// Make the names declared in neon_convolution_kernels visible inside
// namespace depthwise for the remainder of this file.
using namespace neon_convolution_kernels;
// Shorthand for the DepthwiseConvolution instantiation whose
// execute_tile<ActivationFunction::None> specialisation is defined below.
// NOTE(review): the integer arguments presumably encode a 4x4 output tile,
// a 3x3 kernel and a 1x1 stride, and the three float arguments the
// input/bias/output element types -- confirm against the
// DepthwiseConvolution declaration, which is not part of this file.
using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;

#ifdef __aarch64__
template <>
template <>
void Conv::execute_tile<ActivationFunction::None>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    "add x8, %[inptr0], %[input_row_stride]\n"
    "add x15, %[input_col_stride1], %[input_col_stride1]\n"
    "add x23, %[outptr0], %[output_row_stride]\n"
    "add x9, x8, %[input_row_stride]\n"
    "add x16, x15, #64\n"
    "add x17, x15, %[input_col_stride1]\n"
    "add x10, x9, %[input_row_stride]\n"
    "add x7, x17, #64\n"
    "add x19, x17, %[input_col_stride1]\n"
    "add x11, x10, %[input_row_stride]\n"
    "add x20, x19, #64\n"
    "add x21, x19, %[input_col_stride1]\n"
    "add x12, x11, %[input_row_stride]\n"
    "add x22, x21, #64\n"
    "add x24, x23, %[output_row_stride]\n"
    "add x25, x24, %[output_row_stride]\n"
    "add x26, %[output_col_stride1], %[output_col_stride1]\n"
    "and x13, %[n_channels], #3\n"
    "add x27, x26, %[output_col_stride1]\n"
    "lsr x14, %[n_channels], #2\n"
    "cbz x14, 4f\n"
    "1:\n"
    "ldr q14, [%[wbptr]]\n"
    "subs x14, x14, #1\n"
    "mov v17.16b, v14.16b\n"
    "ldr q12, [%[wbptr], #16]\n"
    "mov v23.16b, v14.16b\n"
    "ldr q11, [%[wbptr], #32]\n"
    "mov v24.16b, v14.16b\n"
    "ldr q10, [%[wbptr], #48]\n"
    "mov v20.16b, v14.16b\n"
    "ldr q9, [%[wbptr], #64]\n"
    "mov v16.16b, v14.16b\n"
    "ldr q8, [%[wbptr], #80]\n"
    "mov v13.16b, v14.16b\n"
    "ldr q7, [%[wbptr], #96]\n"
    "mov v0.16b, v14.16b\n"
    "ldr q6, [%[wbptr], #112]\n"
    "mov v1.16b, v14.16b\n"
    "ldr q5, [%[wbptr], #128]\n"
    "mov v2.16b, v14.16b\n"
    "ldr q4, [%[wbptr], #144]\n"
    "mov v3.16b, v14.16b\n"
    "ldr q29, [%[inptr0]]\n"
    "fmla v17.4s, v29.4s, v12.4s\n"
    "ldr q28, [x8]\n"
    "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
    "ldr q25, [x9]\n"
    "ldr q26, [x8, %[input_col_stride1]]\n"
    "ldr q27, [%[inptr0], x15]\n"
    "ldr q15, [x10]\n"
    "ldr q18, [x9, %[input_col_stride1]]\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x8, #64]\n"
    "prfm pldl1keep, [%[inptr0], x28]\n"
    "prfm pldl1keep, [x9, #64]\n"
    "prfm pldl1keep, [x8, x28]\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "prfm pldl1keep, [x10, #64]\n"
    "prfm pldl1keep, [x9, x28]\n"
    "beq 3f\n"
    "2:\n"
    "fmla v17.4s, v28.4s, v9.4s\n"
    "prfm pldl1keep, [x8, x16]\n"
    "fmla v23.4s, v28.4s, v12.4s\n"
    "ldr q22, [x8, x15]\n"
    "fmla v24.4s, v30.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], x7]\n"
    "fmla v17.4s, v30.4s, v11.4s\n"
    "ldr q29, [%[inptr0], x17]\n"
    "fmla v23.4s, v25.4s, v9.4s\n"
    "prfm pldl1keep, [x11, #64]\n"
    "fmla v20.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x10, x28]\n"
    "fmla v17.4s, v25.4s, v6.4s\n"
    "ldr q25, [x11]\n"
    "fmla v23.4s, v26.4s, v11.4s\n"
    "prfm pldl1keep, [x9, x16]\n"
    "fmla v24.4s, v26.4s, v9.4s\n"
    "prfm pldl1keep, [x8, x7]\n"
    "fmla v17.4s, v26.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x20]\n"
    "fmla v16.4s, v26.4s, v12.4s\n"
    "ldr q28, [x10, %[input_col_stride1]]\n"
    "fmla v24.4s, v27.4s, v11.4s\n"
    "prfm pldl1keep, [x12, #64]\n"
    "fmla v17.4s, v27.4s, v10.4s\n"
    "prfm pldl1keep, [x11, x28]\n"
    "fmla v13.4s, v27.4s, v12.4s\n"
    "ldr q19, [x9, x15]\n"
    "fmla v23.4s, v15.4s, v6.4s\n"
    "prfm pldl1keep, [x10, x16]\n"
    "fmla v20.4s, v15.4s, v9.4s\n"
    "prfm pldl1keep, [x9, x7]\n"
    "fmla v0.4s, v15.4s, v12.4s\n"
    "ldr q21, [x8, x17]\n"
    "fmla v17.4s, v18.4s, v5.4s\n"
    "prfm pldl1keep, [x8, x20]\n"
    "fmla v23.4s, v18.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x22]\n"
    "fmla v24.4s, v18.4s, v6.4s\n"
    "prfm pldl1keep, [x12, x28]\n"
    "fmla v20.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x11, x16]\n"
    "fmla v16.4s, v18.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x7]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "ldr q27, [%[inptr0], x19]\n"
    "fmla v17.4s, v22.4s, v7.4s\n"
    "prfm pldl1keep, [x9, x20]\n"
    "fmla v23.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [x8, x22]\n"
    "fmla v24.4s, v22.4s, v8.4s\n"
    "prfm pldl1keep, [x12, x16]\n"
    "fmla v16.4s, v22.4s, v11.4s\n"
    "prfm pldl1keep, [x11, x7]\n"
    "fmla v13.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x20]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "ldr q18, [x12]\n"
    "fmla v24.4s, v29.4s, v10.4s\n"
    "prfm pldl1keep, [x9, x22]\n"
    "fmla v13.4s, v29.4s, v11.4s\n"
    "prfm pldl1keep, [x12, x7]\n"
    "fmla v3.4s, v29.4s, v12.4s\n"
    "ldr q22, [x11, %[input_col_stride1]]\n"
    "fmla v20.4s, v25.4s, v6.4s\n"
    "prfm pldl1keep, [x11, x20]\n"
    "fmla v0.4s, v25.4s, v9.4s\n"
    "ldr q25, [x10, x15]\n"
    "fmla v23.4s, v28.4s, v5.4s\n"
    "prfm pldl1keep, [x10, x22]\n"
    "fmla v20.4s, v28.4s, v8.4s\n"
    "prfm pldl1keep, [x12, x20]\n"
    "fmla v16.4s, v28.4s, v6.4s\n"
    "prfm pldl1keep, [x11, x22]\n"
    "fmla v0.4s, v28.4s, v11.4s\n"
    "prfm pldl1keep, [x12, x22]\n"
    "fmla v1.4s, v28.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v17.4s, v19.4s, v4.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v23.4s, v19.4s, v7.4s\n"
    "subs x14, x14, #1\n"
    "fmla v24.4s, v19.4s, v5.4s\n"
    "fmla v20.4s, v19.4s, v10.4s\n"
    "str q17, [%[outptr0]]\n"
    "mov v15.16b, v14.16b\n"
    "fmla v16.4s, v19.4s, v8.4s\n"
    "fmla v13.4s, v19.4s, v6.4s\n"
    "fmla v15.4s, v28.4s, v12.4s\n"
    "ldr q29, [x9, x17]\n"
    "fmla v1.4s, v19.4s, v11.4s\n"
    "fmla v2.4s, v19.4s, v9.4s\n"
    "fmla v24.4s, v21.4s, v7.4s\n"
    "fmla v16.4s, v21.4s, v10.4s\n"
    "fmla v13.4s, v21.4s, v8.4s\n"
    "fmla v3.4s, v21.4s, v9.4s\n"
    "fmla v2.4s, v21.4s, v11.4s\n"
    "fmla v0.4s, v18.4s, v6.4s\n"
    "mov v18.16b, v14.16b\n"
    "fmla v20.4s, v22.4s, v5.4s\n"
    "fmla v13.4s, v27.4s, v10.4s\n"
    "fmla v3.4s, v27.4s, v11.4s\n"
    "mov v17.16b, v14.16b\n"
    "fmla v18.4s, v19.4s, v12.4s\n"
    "mov v19.16b, v14.16b\n"
    "fmla v0.4s, v22.4s, v8.4s\n"
    "fmla v17.4s, v21.4s, v12.4s\n"
    "ldr q26, [x8, x19]\n"
    "fmla v1.4s, v22.4s, v6.4s\n"
    "fmla v15.4s, v22.4s, v9.4s\n"
    "mov v22.16b, v14.16b\n"
    "mov v21.16b, v14.16b\n"
    "fmla v23.4s, v25.4s, v4.4s\n"
    "fmla v20.4s, v25.4s, v7.4s\n"
    "fmla v16.4s, v25.4s, v5.4s\n"
    "fmla v0.4s, v25.4s, v10.4s\n"
    "fmla v1.4s, v25.4s, v8.4s\n"
    "fmla v2.4s, v25.4s, v6.4s\n"
    "str q23, [x23]\n"
    "fmla v15.4s, v25.4s, v11.4s\n"
    "fmla v18.4s, v25.4s, v9.4s\n"
    "ldr q28, [%[inptr0], x21]\n"
    "fmla v19.4s, v25.4s, v12.4s\n"
    "ldr q30, [x12, %[input_col_stride1]]\n"
    "fmla v24.4s, v29.4s, v4.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v16.4s, v29.4s, v7.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v13.4s, v29.4s, v5.4s\n"
    "prfm pldl1keep, [%[inptr0], x28]\n"
    "str q24, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v1.4s, v29.4s, v10.4s\n"
    "fmla v2.4s, v29.4s, v8.4s\n"
    "ldr q27, [x11, x15]\n"
    "fmla v3.4s, v29.4s, v6.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v18.4s, v29.4s, v11.4s\n"
    "fmla v17.4s, v29.4s, v9.4s\n"
    "fmla v22.4s, v29.4s, v12.4s\n"
    "ldr q23, [x10, x17]\n"
    "fmla v13.4s, v26.4s, v7.4s\n"
    "fmla v2.4s, v26.4s, v10.4s\n"
    "fmla v3.4s, v26.4s, v8.4s\n"
    "fmla v17.4s, v26.4s, v11.4s\n"
    "fmla v0.4s, v30.4s, v5.4s\n"
    "ldr q24, [x9, x19]\n"
    "fmla v15.4s, v30.4s, v6.4s\n"
    "ldr q29, [x8, x21]\n"
    "fmla v3.4s, v28.4s, v10.4s\n"
    "ldr q14, [x12, x15]\n"
    "fmla v20.4s, v27.4s, v4.4s\n"
    "add x8, x8, #16\n"
    "fmla v0.4s, v27.4s, v7.4s\n"
    "prfm pldl1keep, [x8, #64]\n"
    "fmla v1.4s, v27.4s, v5.4s\n"
    "prfm pldl1keep, [x8, x28]\n"
    "str q20, [x24]\n"
    "fmla v15.4s, v27.4s, v8.4s\n"
    "fmla v18.4s, v27.4s, v6.4s\n"
    "ldr q25, [x11, x17]\n"
    "fmla v19.4s, v27.4s, v9.4s\n"
    "ldr q30, [x10, x19]\n"
    "fmla v16.4s, v23.4s, v4.4s\n"
    "fmla v1.4s, v23.4s, v7.4s\n"
    "fmla v2.4s, v23.4s, v5.4s\n"
    "fmla v15.4s, v23.4s, v10.4s\n"
    "fmla v18.4s, v23.4s, v8.4s\n"
    "fmla v17.4s, v23.4s, v6.4s\n"
    "str q16, [x23, %[output_col_stride1]]\n"
    "fmla v19.4s, v23.4s, v11.4s\n"
    "fmla v22.4s, v23.4s, v9.4s\n"
    "ldr q26, [x9, x21]\n"
    "fmla v21.4s, v23.4s, v12.4s\n"
    "ldr q27, [x12, x17]\n"
    "fmla v13.4s, v24.4s, v4.4s\n"
    "ldr q20, [x11, x19]\n"
    "fmla v2.4s, v24.4s, v7.4s\n"
    "add x9, x9, #16\n"
    "fmla v3.4s, v24.4s, v5.4s\n"
    "prfm pldl1keep, [x9, #64]\n"
    "str q13, [%[outptr0], x26]\n"
    "fmla v18.4s, v24.4s, v10.4s\n"
    "fmla v17.4s, v24.4s, v8.4s\n"
    "ldr q23, [x10, x21]\n"
    "fmla v22.4s, v24.4s, v11.4s\n"
    "ldr q24, [x12, x19]\n"
    "fmla v3.4s, v29.4s, v7.4s\n"
    "prfm pldl1keep, [x9, x28]\n"
    "fmla v17.4s, v29.4s, v10.4s\n"
    "ldr q16, [x11, x21]\n"
    "fmla v0.4s, v14.4s, v4.4s\n"
    "add x10, x10, #16\n"
    "fmla v15.4s, v14.4s, v5.4s\n"
    "prfm pldl1keep, [x10, #64]\n"
    "fmla v19.4s, v14.4s, v6.4s\n"
    "ldr q13, [x12, x21]\n"
    "str q0, [x25]\n"
    "fmla v1.4s, v25.4s, v4.4s\n"
    "fmla v15.4s, v25.4s, v7.4s\n"
    "ldr q14, [%[wbptr]]\n"
    "fmla v18.4s, v25.4s, v5.4s\n"
    "add x11, x11, #16\n"
    "str q1, [x24, %[output_col_stride1]]\n"
    "fmla v19.4s, v25.4s, v8.4s\n"
    "fmla v22.4s, v25.4s, v6.4s\n"
    "ldr q12, [%[wbptr], #16]\n"
    "fmla v21.4s, v25.4s, v9.4s\n"
    "ldr q29, [%[inptr0]]\n"
    "fmla v2.4s, v30.4s, v4.4s\n"
    "ldr q28, [x8]\n"
    "fmla v18.4s, v30.4s, v7.4s\n"
    "add x12, x12, #16\n"
    "fmla v17.4s, v30.4s, v5.4s\n"
    "fmla v19.4s, v30.4s, v10.4s\n"
    "str q2, [x23, x26]\n"
    "fmla v22.4s, v30.4s, v8.4s\n"
    "fmla v21.4s, v30.4s, v11.4s\n"
    "ldr q9, [%[wbptr], #64]\n"
    "fmla v3.4s, v26.4s, v4.4s\n"
    "ldr q30, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v17.4s, v26.4s, v7.4s\n"
    "ldr q25, [x9]\n"
    "fmla v22.4s, v26.4s, v10.4s\n"
    "ldr q11, [%[wbptr], #32]\n"
    "str q3, [%[outptr0], x27]\n"
    "fmla v15.4s, v27.4s, v4.4s\n"
    "fmla v19.4s, v27.4s, v5.4s\n"
    "ldr q26, [x8, %[input_col_stride1]]\n"
    "fmla v21.4s, v27.4s, v6.4s\n"
    "ldr q27, [%[inptr0], x15]\n"
    "str q15, [x25, %[output_col_stride1]]\n"
    "fmla v18.4s, v20.4s, v4.4s\n"
    "fmla v19.4s, v20.4s, v7.4s\n"
    "ldr q15, [x10]\n"
    "fmla v22.4s, v20.4s, v5.4s\n"
    "ldr q6, [%[wbptr], #112]\n"
    "str q18, [x24, x26]\n"
    "fmla v21.4s, v20.4s, v8.4s\n"
    "fmla v17.4s, v23.4s, v4.4s\n"
    "ldr q18, [x9, %[input_col_stride1]]\n"
    "fmla v22.4s, v23.4s, v7.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmla v21.4s, v23.4s, v10.4s\n"
    "ldr q8, [%[wbptr], #80]\n"
    "str q17, [x23, x27]\n"
    "fmla v19.4s, v24.4s, v4.4s\n"
    "fmla v22.4s, v16.4s, v4.4s\n"
    "add x23, x23, #16\n"
    "fmla v21.4s, v24.4s, v5.4s\n"
    "ldr q10, [%[wbptr], #48]\n"
    "str q19, [x25, x26]\n"
    "mov v17.16b, v14.16b\n"
    "str q22, [x24, x27]\n"
    "mov v23.16b, v14.16b\n"
    "fmla v21.4s, v16.4s, v7.4s\n"
    "ldr q5, [%[wbptr], #128]\n"
    "mov v24.16b, v14.16b\n"
    "add x24, x24, #16\n"
    "mov v20.16b, v14.16b\n"
    "mov v16.16b, v14.16b\n"
    "fmla v21.4s, v13.4s, v4.4s\n"
    "ldr q7, [%[wbptr], #96]\n"
    "mov v13.16b, v14.16b\n"
    "mov v0.16b, v14.16b\n"
    "mov v1.16b, v14.16b\n"
    "mov v2.16b, v14.16b\n"
    "str q21, [x25, x27]\n"
    "mov v3.16b, v14.16b\n"
    "ldr q4, [%[wbptr], #144]\n"
    "add x25, x25, #16\n"
    "fmla v17.4s, v29.4s, v12.4s\n"
    "bne 2b\n"
    "3:\n"
    "fmla v17.4s, v28.4s, v9.4s\n"
    "prfm pldl1keep, [x8, x16]\n"
    "fmla v23.4s, v28.4s, v12.4s\n"
    "ldr q22, [x8, x15]\n"
    "fmla v24.4s, v30.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], x7]\n"
    "fmla v17.4s, v30.4s, v11.4s\n"
    "ldr q29, [%[inptr0], x17]\n"
    "fmla v23.4s, v25.4s, v9.4s\n"
    "prfm pldl1keep, [x11, #64]\n"
    "fmla v20.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x10, x28]\n"
    "fmla v17.4s, v25.4s, v6.4s\n"
    "ldr q25, [x11]\n"
    "fmla v23.4s, v26.4s, v11.4s\n"
    "prfm pldl1keep, [x9, x16]\n"
    "fmla v24.4s, v26.4s, v9.4s\n"
    "prfm pldl1keep, [x8, x7]\n"
    "fmla v17.4s, v26.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x20]\n"
    "fmla v16.4s, v26.4s, v12.4s\n"
    "ldr q28, [x10, %[input_col_stride1]]\n"
    "fmla v24.4s, v27.4s, v11.4s\n"
    "prfm pldl1keep, [x12, #64]\n"
    "fmla v17.4s, v27.4s, v10.4s\n"
    "prfm pldl1keep, [x11, x28]\n"
    "fmla v13.4s, v27.4s, v12.4s\n"
    "ldr q19, [x9, x15]\n"
    "fmla v23.4s, v15.4s, v6.4s\n"
    "prfm pldl1keep, [x10, x16]\n"
    "fmla v20.4s, v15.4s, v9.4s\n"
    "prfm pldl1keep, [x9, x7]\n"
    "fmla v0.4s, v15.4s, v12.4s\n"
    "ldr q21, [x8, x17]\n"
    "fmla v17.4s, v18.4s, v5.4s\n"
    "prfm pldl1keep, [x8, x20]\n"
    "fmla v23.4s, v18.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x22]\n"
    "fmla v24.4s, v18.4s, v6.4s\n"
    "prfm pldl1keep, [x12, x28]\n"
    "fmla v20.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x11, x16]\n"
    "fmla v16.4s, v18.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x7]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "ldr q27, [%[inptr0], x19]\n"
    "fmla v17.4s, v22.4s, v7.4s\n"
    "prfm pldl1keep, [x9, x20]\n"
    "fmla v23.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [x8, x22]\n"
    "fmla v24.4s, v22.4s, v8.4s\n"
    "prfm pldl1keep, [x12, x16]\n"
    "fmla v16.4s, v22.4s, v11.4s\n"
    "prfm pldl1keep, [x11, x7]\n"
    "fmla v13.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x20]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "ldr q18, [x12]\n"
    "fmla v24.4s, v29.4s, v10.4s\n"
    "prfm pldl1keep, [x9, x22]\n"
    "fmla v13.4s, v29.4s, v11.4s\n"
    "prfm pldl1keep, [x12, x7]\n"
    "fmla v3.4s, v29.4s, v12.4s\n"
    "ldr q22, [x11, %[input_col_stride1]]\n"
    "fmla v20.4s, v25.4s, v6.4s\n"
    "prfm pldl1keep, [x11, x20]\n"
    "fmla v0.4s, v25.4s, v9.4s\n"
    "ldr q25, [x10, x15]\n"
    "fmla v23.4s, v28.4s, v5.4s\n"
    "prfm pldl1keep, [x10, x22]\n"
    "fmla v20.4s, v28.4s, v8.4s\n"
    "prfm pldl1keep, [x12, x20]\n"
    "fmla v16.4s, v28.4s, v6.4s\n"
    "prfm pldl1keep, [x11, x22]\n"
    "fmla v0.4s, v28.4s, v11.4s\n"
    "prfm pldl1keep, [x12, x22]\n"
    "fmla v1.4s, v28.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v17.4s, v19.4s, v4.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v23.4s, v19.4s, v7.4s\n"
    "fmla v24.4s, v19.4s, v5.4s\n"
    "fmla v20.4s, v19.4s, v10.4s\n"
    "fmla v16.4s, v19.4s, v8.4s\n"
    "str q17, [%[outptr0]]\n"
    "mov v15.16b, v14.16b\n"
    "fmla v13.4s, v19.4s, v6.4s\n"
    "fmla v1.4s, v19.4s, v11.4s\n"
    "fmla v15.4s, v28.4s, v12.4s\n"
    "ldr q29, [x9, x17]\n"
    "fmla v2.4s, v19.4s, v9.4s\n"
    "fmla v24.4s, v21.4s, v7.4s\n"
    "fmla v16.4s, v21.4s, v10.4s\n"
    "fmla v13.4s, v21.4s, v8.4s\n"
    "fmla v3.4s, v21.4s, v9.4s\n"
    "fmla v0.4s, v18.4s, v6.4s\n"
    "mov v18.16b, v14.16b\n"
    "fmla v2.4s, v21.4s, v11.4s\n"
    "fmla v13.4s, v27.4s, v10.4s\n"
    "fmla v20.4s, v22.4s, v5.4s\n"
    "fmla v18.4s, v19.4s, v12.4s\n"
    "ldr q26, [x8, x19]\n"
    "fmla v3.4s, v27.4s, v11.4s\n"
    "ldr q28, [%[inptr0], x21]\n"
    "fmla v0.4s, v22.4s, v8.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v1.4s, v22.4s, v6.4s\n"
    "fmla v15.4s, v22.4s, v9.4s\n"
    "mov v17.16b, v14.16b\n"
    "fmla v23.4s, v25.4s, v4.4s\n"
    "fmla v20.4s, v25.4s, v7.4s\n"
    "fmla v16.4s, v25.4s, v5.4s\n"
    "fmla v17.4s, v21.4s, v12.4s\n"
    "ldr q30, [x12, %[input_col_stride1]]\n"
    "str q23, [x23]\n"
    "mov v19.16b, v14.16b\n"
    "fmla v0.4s, v25.4s, v10.4s\n"
    "fmla v1.4s, v25.4s, v8.4s\n"
    "fmla v2.4s, v25.4s, v6.4s\n"
    "fmla v15.4s, v25.4s, v11.4s\n"
    "fmla v18.4s, v25.4s, v9.4s\n"
    "fmla v19.4s, v25.4s, v12.4s\n"
    "mov v22.16b, v14.16b\n"
    "mov v21.16b, v14.16b\n"
    "fmla v24.4s, v29.4s, v4.4s\n"
    "fmla v16.4s, v29.4s, v7.4s\n"
    "fmla v13.4s, v29.4s, v5.4s\n"
    "fmla v1.4s, v29.4s, v10.4s\n"
    "fmla v2.4s, v29.4s, v8.4s\n"
    "fmla v3.4s, v29.4s, v6.4s\n"
    "str q24, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v18.4s, v29.4s, v11.4s\n"
    "fmla v17.4s, v29.4s, v9.4s\n"
    "ldr q27, [x11, x15]\n"
    "fmla v22.4s, v29.4s, v12.4s\n"
    "ldr q23, [x10, x17]\n"
    "fmla v13.4s, v26.4s, v7.4s\n"
    "fmla v2.4s, v26.4s, v10.4s\n"
    "fmla v3.4s, v26.4s, v8.4s\n"
    "fmla v17.4s, v26.4s, v11.4s\n"
    "fmla v0.4s, v30.4s, v5.4s\n"
    "ldr q24, [x9, x19]\n"
    "fmla v15.4s, v30.4s, v6.4s\n"
    "ldr q29, [x8, x21]\n"
    "fmla v3.4s, v28.4s, v10.4s\n"
    "ldr q14, [x12, x15]\n"
    "fmla v20.4s, v27.4s, v4.4s\n"
    "add x8, x8, #16\n"
    "fmla v0.4s, v27.4s, v7.4s\n"
    "fmla v1.4s, v27.4s, v5.4s\n"
    "fmla v15.4s, v27.4s, v8.4s\n"
    "fmla v18.4s, v27.4s, v6.4s\n"
    "str q20, [x24]\n"
    "fmla v19.4s, v27.4s, v9.4s\n"
    "fmla v16.4s, v23.4s, v4.4s\n"
    "ldr q25, [x11, x17]\n"
    "fmla v1.4s, v23.4s, v7.4s\n"
    "ldr q30, [x10, x19]\n"
    "fmla v2.4s, v23.4s, v5.4s\n"
    "fmla v15.4s, v23.4s, v10.4s\n"
    "str q16, [x23, %[output_col_stride1]]\n"
    "fmla v18.4s, v23.4s, v8.4s\n"
    "fmla v17.4s, v23.4s, v6.4s\n"
    "ldr q26, [x9, x21]\n"
    "fmla v19.4s, v23.4s, v11.4s\n"
    "add x9, x9, #16\n"
    "fmla v22.4s, v23.4s, v9.4s\n"
    "fmla v21.4s, v23.4s, v12.4s\n"
    "fmla v13.4s, v24.4s, v4.4s\n"
    "ldr q27, [x12, x17]\n"
    "fmla v2.4s, v24.4s, v7.4s\n"
    "ldr q20, [x11, x19]\n"
    "fmla v3.4s, v24.4s, v5.4s\n"
    "fmla v18.4s, v24.4s, v10.4s\n"
    "str q13, [%[outptr0], x26]\n"
    "fmla v17.4s, v24.4s, v8.4s\n"
    "fmla v22.4s, v24.4s, v11.4s\n"
    "ldr q23, [x10, x21]\n"
    "fmla v3.4s, v29.4s, v7.4s\n"
    "ldr q24, [x12, x19]\n"
    "fmla v17.4s, v29.4s, v10.4s\n"
    "ldr q16, [x11, x21]\n"
    "fmla v0.4s, v14.4s, v4.4s\n"
    "add x10, x10, #16\n"
    "fmla v15.4s, v14.4s, v5.4s\n"
    "add x11, x11, #16\n"
    "fmla v19.4s, v14.4s, v6.4s\n"
    "ldr q13, [x12, x21]\n"
    "str q0, [x25]\n"
    "fmla v1.4s, v25.4s, v4.4s\n"
    "fmla v15.4s, v25.4s, v7.4s\n"
    "add x12, x12, #16\n"
    "fmla v18.4s, v25.4s, v5.4s\n"
    "fmla v19.4s, v25.4s, v8.4s\n"
    "str q1, [x24, %[output_col_stride1]]\n"
    "fmla v22.4s, v25.4s, v6.4s\n"
    "fmla v21.4s, v25.4s, v9.4s\n"
    "fmla v2.4s, v30.4s, v4.4s\n"
    "fmla v18.4s, v30.4s, v7.4s\n"
    "fmla v17.4s, v30.4s, v5.4s\n"
    "fmla v19.4s, v30.4s, v10.4s\n"
    "fmla v22.4s, v30.4s, v8.4s\n"
    "str q2, [x23, x26]\n"
    "fmla v21.4s, v30.4s, v11.4s\n"
    "fmla v3.4s, v26.4s, v4.4s\n"
    "fmla v17.4s, v26.4s, v7.4s\n"
    "fmla v22.4s, v26.4s, v10.4s\n"
    "fmla v15.4s, v27.4s, v4.4s\n"
    "fmla v19.4s, v27.4s, v5.4s\n"
    "fmla v21.4s, v27.4s, v6.4s\n"
    "str q3, [%[outptr0], x27]\n"
    "fmla v18.4s, v20.4s, v4.4s\n"
    "str q15, [x25, %[output_col_stride1]]\n"
    "fmla v22.4s, v20.4s, v5.4s\n"
    "fmla v19.4s, v20.4s, v7.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "str q18, [x24, x26]\n"
    "fmla v21.4s, v20.4s, v8.4s\n"
    "fmla v17.4s, v23.4s, v4.4s\n"
    "fmla v22.4s, v23.4s, v7.4s\n"
    "fmla v19.4s, v24.4s, v4.4s\n"
    "fmla v21.4s, v23.4s, v10.4s\n"
    "str q17, [x23, x27]\n"
    "fmla v22.4s, v16.4s, v4.4s\n"
    "str q19, [x25, x26]\n"
    "add x23, x23, #16\n"
    "fmla v21.4s, v24.4s, v5.4s\n"
    "str q22, [x24, x27]\n"
    "add x24, x24, #16\n"
    "fmla v21.4s, v16.4s, v7.4s\n"
    "fmla v21.4s, v13.4s, v4.4s\n"
    "str q21, [x25, x27]\n"
    "add x25, x25, #16\n"
    "4:\n"
    "cbz x13, 7f\n"
    "ldr s14, [%[wbptr]]\n"
    "mov v17.16b, v14.16b\n"
    "ldr s12, [%[wbptr], #4]\n"
    "mov v23.16b, v14.16b\n"
    "ldr s11, [%[wbptr], #8]\n"
    "mov v24.16b, v14.16b\n"
    "ldr s10, [%[wbptr], #12]\n"
    "mov v20.16b, v14.16b\n"
    "ldr s9, [%[wbptr], #16]\n"
    "mov v16.16b, v14.16b\n"
    "ldr s8, [%[wbptr], #20]\n"
    "mov v13.16b, v14.16b\n"
    "ldr s7, [%[wbptr], #24]\n"
    "mov v0.16b, v14.16b\n"
    "ldr s6, [%[wbptr], #28]\n"
    "mov v1.16b, v14.16b\n"
    "ldr s5, [%[wbptr], #32]\n"
    "mov v2.16b, v14.16b\n"
    "ldr s4, [%[wbptr], #36]\n"
    "mov v3.16b, v14.16b\n"
    "ldr s29, [%[inptr0]]\n"
    "fmla v17.4s, v29.4s, v12.4s\n"
    "ldr s28, [x8]\n"
    "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
    "subs x13, x13, #1\n"
    "ldr s25, [x9]\n"
    "ldr s26, [x8, %[input_col_stride1]]\n"
    "ldr s27, [%[inptr0], x15]\n"
    "ldr s15, [x10]\n"
    "ldr s18, [x9, %[input_col_stride1]]\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x8, #64]\n"
    "prfm pldl1keep, [%[inptr0], x28]\n"
    "prfm pldl1keep, [x9, #64]\n"
    "prfm pldl1keep, [x8, x28]\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "prfm pldl1keep, [x10, #64]\n"
    "prfm pldl1keep, [x9, x28]\n"
    "beq 6f\n"
    "5:\n"
    "fmla v17.4s, v28.4s, v9.4s\n"
    "prfm pldl1keep, [x8, x16]\n"
    "fmla v23.4s, v28.4s, v12.4s\n"
    "ldr s22, [x8, x15]\n"
    "fmla v24.4s, v30.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], x7]\n"
    "fmla v17.4s, v30.4s, v11.4s\n"
    "ldr s29, [%[inptr0], x17]\n"
    "fmla v23.4s, v25.4s, v9.4s\n"
    "prfm pldl1keep, [x11, #64]\n"
    "fmla v20.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x10, x28]\n"
    "fmla v17.4s, v25.4s, v6.4s\n"
    "ldr s25, [x11]\n"
    "fmla v23.4s, v26.4s, v11.4s\n"
    "prfm pldl1keep, [x9, x16]\n"
    "fmla v24.4s, v26.4s, v9.4s\n"
    "prfm pldl1keep, [x8, x7]\n"
    "fmla v17.4s, v26.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x20]\n"
    "fmla v16.4s, v26.4s, v12.4s\n"
    "ldr s28, [x10, %[input_col_stride1]]\n"
    "fmla v24.4s, v27.4s, v11.4s\n"
    "prfm pldl1keep, [x12, #64]\n"
    "fmla v17.4s, v27.4s, v10.4s\n"
    "prfm pldl1keep, [x11, x28]\n"
    "fmla v13.4s, v27.4s, v12.4s\n"
    "ldr s19, [x9, x15]\n"
    "fmla v23.4s, v15.4s, v6.4s\n"
    "prfm pldl1keep, [x10, x16]\n"
    "fmla v20.4s, v15.4s, v9.4s\n"
    "prfm pldl1keep, [x9, x7]\n"
    "fmla v0.4s, v15.4s, v12.4s\n"
    "ldr s21, [x8, x17]\n"
    "fmla v17.4s, v18.4s, v5.4s\n"
    "prfm pldl1keep, [x8, x20]\n"
    "fmla v23.4s, v18.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x22]\n"
    "fmla v24.4s, v18.4s, v6.4s\n"
    "prfm pldl1keep, [x12, x28]\n"
    "fmla v20.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x11, x16]\n"
    "fmla v16.4s, v18.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x7]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "ldr s27, [%[inptr0], x19]\n"
    "fmla v17.4s, v22.4s, v7.4s\n"
    "prfm pldl1keep, [x9, x20]\n"
    "fmla v23.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [x8, x22]\n"
    "fmla v24.4s, v22.4s, v8.4s\n"
    "prfm pldl1keep, [x12, x16]\n"
    "fmla v16.4s, v22.4s, v11.4s\n"
    "prfm pldl1keep, [x11, x7]\n"
    "fmla v13.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x20]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "ldr s18, [x12]\n"
    "fmla v24.4s, v29.4s, v10.4s\n"
    "prfm pldl1keep, [x9, x22]\n"
    "fmla v13.4s, v29.4s, v11.4s\n"
    "prfm pldl1keep, [x12, x7]\n"
    "fmla v3.4s, v29.4s, v12.4s\n"
    "ldr s22, [x11, %[input_col_stride1]]\n"
    "fmla v20.4s, v25.4s, v6.4s\n"
    "prfm pldl1keep, [x11, x20]\n"
    "fmla v0.4s, v25.4s, v9.4s\n"
    "ldr s25, [x10, x15]\n"
    "fmla v23.4s, v28.4s, v5.4s\n"
    "prfm pldl1keep, [x10, x22]\n"
    "fmla v20.4s, v28.4s, v8.4s\n"
    "prfm pldl1keep, [x12, x20]\n"
    "fmla v16.4s, v28.4s, v6.4s\n"
    "prfm pldl1keep, [x11, x22]\n"
    "fmla v0.4s, v28.4s, v11.4s\n"
    "prfm pldl1keep, [x12, x22]\n"
    "fmla v1.4s, v28.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v17.4s, v19.4s, v4.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v23.4s, v19.4s, v7.4s\n"
    "subs x13, x13, #1\n"
    "fmla v24.4s, v19.4s, v5.4s\n"
    "fmla v20.4s, v19.4s, v10.4s\n"
    "str s17, [%[outptr0]]\n"
    "mov v15.16b, v14.16b\n"
    "fmla v16.4s, v19.4s, v8.4s\n"
    "fmla v13.4s, v19.4s, v6.4s\n"
    "fmla v15.4s, v28.4s, v12.4s\n"
    "ldr s29, [x9, x17]\n"
    "fmla v1.4s, v19.4s, v11.4s\n"
    "fmla v2.4s, v19.4s, v9.4s\n"
    "fmla v24.4s, v21.4s, v7.4s\n"
    "fmla v16.4s, v21.4s, v10.4s\n"
    "fmla v13.4s, v21.4s, v8.4s\n"
    "fmla v3.4s, v21.4s, v9.4s\n"
    "fmla v2.4s, v21.4s, v11.4s\n"
    "fmla v0.4s, v18.4s, v6.4s\n"
    "mov v18.16b, v14.16b\n"
    "fmla v20.4s, v22.4s, v5.4s\n"
    "fmla v13.4s, v27.4s, v10.4s\n"
    "fmla v3.4s, v27.4s, v11.4s\n"
    "mov v17.16b, v14.16b\n"
    "fmla v18.4s, v19.4s, v12.4s\n"
    "mov v19.16b, v14.16b\n"
    "fmla v0.4s, v22.4s, v8.4s\n"
    "fmla v17.4s, v21.4s, v12.4s\n"
    "ldr s26, [x8, x19]\n"
    "fmla v1.4s, v22.4s, v6.4s\n"
    "fmla v15.4s, v22.4s, v9.4s\n"
    "mov v22.16b, v14.16b\n"
    "mov v21.16b, v14.16b\n"
    "fmla v23.4s, v25.4s, v4.4s\n"
    "fmla v20.4s, v25.4s, v7.4s\n"
    "fmla v16.4s, v25.4s, v5.4s\n"
    "fmla v0.4s, v25.4s, v10.4s\n"
    "fmla v1.4s, v25.4s, v8.4s\n"
    "fmla v2.4s, v25.4s, v6.4s\n"
    "str s23, [x23]\n"
    "fmla v15.4s, v25.4s, v11.4s\n"
    "fmla v18.4s, v25.4s, v9.4s\n"
    "ldr s28, [%[inptr0], x21]\n"
    "fmla v19.4s, v25.4s, v12.4s\n"
    "ldr s30, [x12, %[input_col_stride1]]\n"
    "fmla v24.4s, v29.4s, v4.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v16.4s, v29.4s, v7.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v13.4s, v29.4s, v5.4s\n"
    "prfm pldl1keep, [%[inptr0], x28]\n"
    "str s24, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v1.4s, v29.4s, v10.4s\n"
    "fmla v2.4s, v29.4s, v8.4s\n"
    "ldr s27, [x11, x15]\n"
    "fmla v3.4s, v29.4s, v6.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v18.4s, v29.4s, v11.4s\n"
    "fmla v17.4s, v29.4s, v9.4s\n"
    "fmla v22.4s, v29.4s, v12.4s\n"
    "ldr s23, [x10, x17]\n"
    "fmla v13.4s, v26.4s, v7.4s\n"
    "fmla v2.4s, v26.4s, v10.4s\n"
    "fmla v3.4s, v26.4s, v8.4s\n"
    "fmla v17.4s, v26.4s, v11.4s\n"
    "fmla v0.4s, v30.4s, v5.4s\n"
    "ldr s24, [x9, x19]\n"
    "fmla v15.4s, v30.4s, v6.4s\n"
    "ldr s29, [x8, x21]\n"
    "fmla v3.4s, v28.4s, v10.4s\n"
    "ldr s14, [x12, x15]\n"
    "fmla v20.4s, v27.4s, v4.4s\n"
    "add x8, x8, #4\n"
    "fmla v0.4s, v27.4s, v7.4s\n"
    "prfm pldl1keep, [x8, #64]\n"
    "fmla v1.4s, v27.4s, v5.4s\n"
    "prfm pldl1keep, [x8, x28]\n"
    "str s20, [x24]\n"
    "fmla v15.4s, v27.4s, v8.4s\n"
    "fmla v18.4s, v27.4s, v6.4s\n"
    "ldr s25, [x11, x17]\n"
    "fmla v19.4s, v27.4s, v9.4s\n"
    "ldr s30, [x10, x19]\n"
    "fmla v16.4s, v23.4s, v4.4s\n"
    "fmla v1.4s, v23.4s, v7.4s\n"
    "fmla v2.4s, v23.4s, v5.4s\n"
    "fmla v15.4s, v23.4s, v10.4s\n"
    "fmla v18.4s, v23.4s, v8.4s\n"
    "fmla v17.4s, v23.4s, v6.4s\n"
    "str s16, [x23, %[output_col_stride1]]\n"
    "fmla v19.4s, v23.4s, v11.4s\n"
    "fmla v22.4s, v23.4s, v9.4s\n"
    "ldr s26, [x9, x21]\n"
    "fmla v21.4s, v23.4s, v12.4s\n"
    "ldr s27, [x12, x17]\n"
    "fmla v13.4s, v24.4s, v4.4s\n"
    "ldr s20, [x11, x19]\n"
    "fmla v2.4s, v24.4s, v7.4s\n"
    "add x9, x9, #4\n"
    "fmla v3.4s, v24.4s, v5.4s\n"
    "prfm pldl1keep, [x9, #64]\n"
    "str s13, [%[outptr0], x26]\n"
    "fmla v18.4s, v24.4s, v10.4s\n"
    "fmla v17.4s, v24.4s, v8.4s\n"
    "ldr s23, [x10, x21]\n"
    "fmla v22.4s, v24.4s, v11.4s\n"
    "ldr s24, [x12, x19]\n"
    "fmla v3.4s, v29.4s, v7.4s\n"
    "prfm pldl1keep, [x9, x28]\n"
    "fmla v17.4s, v29.4s, v10.4s\n"
    "ldr s16, [x11, x21]\n"
    "fmla v0.4s, v14.4s, v4.4s\n"
    "add x10, x10, #4\n"
    "fmla v15.4s, v14.4s, v5.4s\n"
    "prfm pldl1keep, [x10, #64]\n"
    "fmla v19.4s, v14.4s, v6.4s\n"
    "ldr s13, [x12, x21]\n"
    "str s0, [x25]\n"
    "fmla v1.4s, v25.4s, v4.4s\n"
    "fmla v15.4s, v25.4s, v7.4s\n"
    "ldr s14, [%[wbptr]]\n"
    "fmla v18.4s, v25.4s, v5.4s\n"
    "add x11, x11, #4\n"
    "str s1, [x24, %[output_col_stride1]]\n"
    "fmla v19.4s, v25.4s, v8.4s\n"
    "fmla v22.4s, v25.4s, v6.4s\n"
    "ldr s12, [%[wbptr], #4]\n"
    "fmla v21.4s, v25.4s, v9.4s\n"
    "ldr s29, [%[inptr0]]\n"
    "fmla v2.4s, v30.4s, v4.4s\n"
    "ldr s28, [x8]\n"
    "fmla v18.4s, v30.4s, v7.4s\n"
    "add x12, x12, #4\n"
    "fmla v17.4s, v30.4s, v5.4s\n"
    "fmla v19.4s, v30.4s, v10.4s\n"
    "str s2, [x23, x26]\n"
    "fmla v22.4s, v30.4s, v8.4s\n"
    "fmla v21.4s, v30.4s, v11.4s\n"
    "ldr s9, [%[wbptr], #16]\n"
    "fmla v3.4s, v26.4s, v4.4s\n"
    "ldr s30, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v17.4s, v26.4s, v7.4s\n"
    "ldr s25, [x9]\n"
    "fmla v22.4s, v26.4s, v10.4s\n"
    "ldr s11, [%[wbptr], #8]\n"
    "str s3, [%[outptr0], x27]\n"
    "fmla v15.4s, v27.4s, v4.4s\n"
    "fmla v19.4s, v27.4s, v5.4s\n"
    "ldr s26, [x8, %[input_col_stride1]]\n"
    "fmla v21.4s, v27.4s, v6.4s\n"
    "ldr s27, [%[inptr0], x15]\n"
    "str s15, [x25, %[output_col_stride1]]\n"
    "fmla v18.4s, v20.4s, v4.4s\n"
    "fmla v19.4s, v20.4s, v7.4s\n"
    "ldr s15, [x10]\n"
    "fmla v22.4s, v20.4s, v5.4s\n"
    "ldr s6, [%[wbptr], #28]\n"
    "str s18, [x24, x26]\n"
    "fmla v21.4s, v20.4s, v8.4s\n"
    "fmla v17.4s, v23.4s, v4.4s\n"
    "ldr s18, [x9, %[input_col_stride1]]\n"
    "fmla v22.4s, v23.4s, v7.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmla v21.4s, v23.4s, v10.4s\n"
    "ldr s8, [%[wbptr], #20]\n"
    "str s17, [x23, x27]\n"
    "fmla v19.4s, v24.4s, v4.4s\n"
    "fmla v22.4s, v16.4s, v4.4s\n"
    "add x23, x23, #4\n"
    "fmla v21.4s, v24.4s, v5.4s\n"
    "ldr s10, [%[wbptr], #12]\n"
    "str s19, [x25, x26]\n"
    "mov v17.16b, v14.16b\n"
    "str s22, [x24, x27]\n"
    "mov v23.16b, v14.16b\n"
    "fmla v21.4s, v16.4s, v7.4s\n"
    "ldr s5, [%[wbptr], #32]\n"
    "mov v24.16b, v14.16b\n"
    "add x24, x24, #4\n"
    "mov v20.16b, v14.16b\n"
    "mov v16.16b, v14.16b\n"
    "fmla v21.4s, v13.4s, v4.4s\n"
    "ldr s7, [%[wbptr], #24]\n"
    "mov v13.16b, v14.16b\n"
    "mov v0.16b, v14.16b\n"
    "mov v1.16b, v14.16b\n"
    "mov v2.16b, v14.16b\n"
    "str s21, [x25, x27]\n"
    "mov v3.16b, v14.16b\n"
    "ldr s4, [%[wbptr], #36]\n"
    "add x25, x25, #4\n"
    "fmla v17.4s, v29.4s, v12.4s\n"
    "bne 5b\n"
    "6:\n"
    "fmla v17.4s, v28.4s, v9.4s\n"
    "prfm pldl1keep, [x8, x16]\n"
    "fmla v23.4s, v28.4s, v12.4s\n"
    "ldr s22, [x8, x15]\n"
    "fmla v24.4s, v30.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], x7]\n"
    "fmla v17.4s, v30.4s, v11.4s\n"
    "ldr s29, [%[inptr0], x17]\n"
    "fmla v23.4s, v25.4s, v9.4s\n"
    "prfm pldl1keep, [x11, #64]\n"
    "fmla v20.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x10, x28]\n"
    "fmla v17.4s, v25.4s, v6.4s\n"
    "ldr s25, [x11]\n"
    "fmla v23.4s, v26.4s, v11.4s\n"
    "prfm pldl1keep, [x9, x16]\n"
    "fmla v24.4s, v26.4s, v9.4s\n"
    "prfm pldl1keep, [x8, x7]\n"
    "fmla v17.4s, v26.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x20]\n"
    "fmla v16.4s, v26.4s, v12.4s\n"
    "ldr s28, [x10, %[input_col_stride1]]\n"
    "fmla v24.4s, v27.4s, v11.4s\n"
    "prfm pldl1keep, [x12, #64]\n"
    "fmla v17.4s, v27.4s, v10.4s\n"
    "prfm pldl1keep, [x11, x28]\n"
    "fmla v13.4s, v27.4s, v12.4s\n"
    "ldr s19, [x9, x15]\n"
    "fmla v23.4s, v15.4s, v6.4s\n"
    "prfm pldl1keep, [x10, x16]\n"
    "fmla v20.4s, v15.4s, v9.4s\n"
    "prfm pldl1keep, [x9, x7]\n"
    "fmla v0.4s, v15.4s, v12.4s\n"
    "ldr s21, [x8, x17]\n"
    "fmla v17.4s, v18.4s, v5.4s\n"
    "prfm pldl1keep, [x8, x20]\n"
    "fmla v23.4s, v18.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x22]\n"
    "fmla v24.4s, v18.4s, v6.4s\n"
    "prfm pldl1keep, [x12, x28]\n"
    "fmla v20.4s, v18.4s, v11.4s\n"
    "prfm pldl1keep, [x11, x16]\n"
    "fmla v16.4s, v18.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x7]\n"
    "fmla v1.4s, v18.4s, v12.4s\n"
    "ldr s27, [%[inptr0], x19]\n"
    "fmla v17.4s, v22.4s, v7.4s\n"
    "prfm pldl1keep, [x9, x20]\n"
    "fmla v23.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [x8, x22]\n"
    "fmla v24.4s, v22.4s, v8.4s\n"
    "prfm pldl1keep, [x12, x16]\n"
    "fmla v16.4s, v22.4s, v11.4s\n"
    "prfm pldl1keep, [x11, x7]\n"
    "fmla v13.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x20]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "ldr s18, [x12]\n"
    "fmla v24.4s, v29.4s, v10.4s\n"
    "prfm pldl1keep, [x9, x22]\n"
    "fmla v13.4s, v29.4s, v11.4s\n"
    "prfm pldl1keep, [x12, x7]\n"
    "fmla v3.4s, v29.4s, v12.4s\n"
    "ldr s22, [x11, %[input_col_stride1]]\n"
    "fmla v20.4s, v25.4s, v6.4s\n"
    "prfm pldl1keep, [x11, x20]\n"
    "fmla v0.4s, v25.4s, v9.4s\n"
    "ldr s25, [x10, x15]\n"
    "fmla v23.4s, v28.4s, v5.4s\n"
    "prfm pldl1keep, [x10, x22]\n"
    "fmla v20.4s, v28.4s, v8.4s\n"
    "prfm pldl1keep, [x12, x20]\n"
    "fmla v16.4s, v28.4s, v6.4s\n"
    "prfm pldl1keep, [x11, x22]\n"
    "fmla v0.4s, v28.4s, v11.4s\n"
    "prfm pldl1keep, [x12, x22]\n"
    "fmla v1.4s, v28.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v17.4s, v19.4s, v4.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v23.4s, v19.4s, v7.4s\n"
    "fmla v24.4s, v19.4s, v5.4s\n"
    "fmla v20.4s, v19.4s, v10.4s\n"
    "fmla v16.4s, v19.4s, v8.4s\n"
    "str s17, [%[outptr0]]\n"
    "mov v15.16b, v14.16b\n"
    "fmla v13.4s, v19.4s, v6.4s\n"
    "fmla v1.4s, v19.4s, v11.4s\n"
    "fmla v15.4s, v28.4s, v12.4s\n"
    "ldr s29, [x9, x17]\n"
    "fmla v2.4s, v19.4s, v9.4s\n"
    "fmla v24.4s, v21.4s, v7.4s\n"
    "fmla v16.4s, v21.4s, v10.4s\n"
    "fmla v13.4s, v21.4s, v8.4s\n"
    "fmla v3.4s, v21.4s, v9.4s\n"
    "fmla v0.4s, v18.4s, v6.4s\n"
    "mov v18.16b, v14.16b\n"
    "fmla v2.4s, v21.4s, v11.4s\n"
    "fmla v13.4s, v27.4s, v10.4s\n"
    "fmla v20.4s, v22.4s, v5.4s\n"
    "fmla v18.4s, v19.4s, v12.4s\n"
    "ldr s26, [x8, x19]\n"
    "fmla v3.4s, v27.4s, v11.4s\n"
    "ldr s28, [%[inptr0], x21]\n"
    "fmla v0.4s, v22.4s, v8.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v1.4s, v22.4s, v6.4s\n"
    "fmla v15.4s, v22.4s, v9.4s\n"
    "mov v17.16b, v14.16b\n"
    "fmla v23.4s, v25.4s, v4.4s\n"
    "fmla v20.4s, v25.4s, v7.4s\n"
    "fmla v16.4s, v25.4s, v5.4s\n"
    "fmla v17.4s, v21.4s, v12.4s\n"
    "ldr s30, [x12, %[input_col_stride1]]\n"
    "str s23, [x23]\n"
    "mov v19.16b, v14.16b\n"
    "fmla v0.4s, v25.4s, v10.4s\n"
    "fmla v1.4s, v25.4s, v8.4s\n"
    "fmla v2.4s, v25.4s, v6.4s\n"
    "fmla v15.4s, v25.4s, v11.4s\n"
    "fmla v18.4s, v25.4s, v9.4s\n"
    "fmla v19.4s, v25.4s, v12.4s\n"
    "mov v22.16b, v14.16b\n"
    "mov v21.16b, v14.16b\n"
    "fmla v24.4s, v29.4s, v4.4s\n"
    "fmla v16.4s, v29.4s, v7.4s\n"
    "fmla v13.4s, v29.4s, v5.4s\n"
    "fmla v1.4s, v29.4s, v10.4s\n"
    "fmla v2.4s, v29.4s, v8.4s\n"
    "fmla v3.4s, v29.4s, v6.4s\n"
    "str s24, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v18.4s, v29.4s, v11.4s\n"
    "fmla v17.4s, v29.4s, v9.4s\n"
    "ldr s27, [x11, x15]\n"
    "fmla v22.4s, v29.4s, v12.4s\n"
    "ldr s23, [x10, x17]\n"
    "fmla v13.4s, v26.4s, v7.4s\n"
    "fmla v2.4s, v26.4s, v10.4s\n"
    "fmla v3.4s, v26.4s, v8.4s\n"
    "fmla v17.4s, v26.4s, v11.4s\n"
    "fmla v0.4s, v30.4s, v5.4s\n"
    "ldr s24, [x9, x19]\n"
    "fmla v15.4s, v30.4s, v6.4s\n"
    "ldr s29, [x8, x21]\n"
    "fmla v3.4s, v28.4s, v10.4s\n"
    "ldr s14, [x12, x15]\n"
    "fmla v20.4s, v27.4s, v4.4s\n"
    "add x8, x8, #4\n"
    "fmla v0.4s, v27.4s, v7.4s\n"
    "fmla v1.4s, v27.4s, v5.4s\n"
    "fmla v15.4s, v27.4s, v8.4s\n"
    "fmla v18.4s, v27.4s, v6.4s\n"
    "str s20, [x24]\n"
    "fmla v19.4s, v27.4s, v9.4s\n"
    "fmla v16.4s, v23.4s, v4.4s\n"
    "ldr s25, [x11, x17]\n"
    "fmla v1.4s, v23.4s, v7.4s\n"
    "ldr s30, [x10, x19]\n"
    "fmla v2.4s, v23.4s, v5.4s\n"
    "fmla v15.4s, v23.4s, v10.4s\n"
    "str s16, [x23, %[output_col_stride1]]\n"
    "fmla v18.4s, v23.4s, v8.4s\n"
    "fmla v17.4s, v23.4s, v6.4s\n"
    "ldr s26, [x9, x21]\n"
    "fmla v19.4s, v23.4s, v11.4s\n"
    "add x9, x9, #4\n"
    "fmla v22.4s, v23.4s, v9.4s\n"
    "fmla v21.4s, v23.4s, v12.4s\n"
    "fmla v13.4s, v24.4s, v4.4s\n"
    "ldr s27, [x12, x17]\n"
    "fmla v2.4s, v24.4s, v7.4s\n"
    "ldr s20, [x11, x19]\n"
    "fmla v3.4s, v24.4s, v5.4s\n"
    "fmla v18.4s, v24.4s, v10.4s\n"
    "str s13, [%[outptr0], x26]\n"
    "fmla v17.4s, v24.4s, v8.4s\n"
    "fmla v22.4s, v24.4s, v11.4s\n"
    "ldr s23, [x10, x21]\n"
    "fmla v3.4s, v29.4s, v7.4s\n"
    "ldr s24, [x12, x19]\n"
    "fmla v17.4s, v29.4s, v10.4s\n"
    "ldr s16, [x11, x21]\n"
    "fmla v0.4s, v14.4s, v4.4s\n"
    "add x10, x10, #4\n"
    "fmla v15.4s, v14.4s, v5.4s\n"
    "add x11, x11, #4\n"
    "fmla v19.4s, v14.4s, v6.4s\n"
    "ldr s13, [x12, x21]\n"
    "str s0, [x25]\n"
    "fmla v1.4s, v25.4s, v4.4s\n"
    "fmla v15.4s, v25.4s, v7.4s\n"
    "add x12, x12, #4\n"
    "fmla v18.4s, v25.4s, v5.4s\n"
    "fmla v19.4s, v25.4s, v8.4s\n"
    "str s1, [x24, %[output_col_stride1]]\n"
    "fmla v22.4s, v25.4s, v6.4s\n"
    "fmla v21.4s, v25.4s, v9.4s\n"
    "fmla v2.4s, v30.4s, v4.4s\n"
    "fmla v18.4s, v30.4s, v7.4s\n"
    "fmla v17.4s, v30.4s, v5.4s\n"
    "fmla v19.4s, v30.4s, v10.4s\n"
    "fmla v22.4s, v30.4s, v8.4s\n"
    "str s2, [x23, x26]\n"
    "fmla v21.4s, v30.4s, v11.4s\n"
    "fmla v3.4s, v26.4s, v4.4s\n"
    "fmla v17.4s, v26.4s, v7.4s\n"
    "fmla v22.4s, v26.4s, v10.4s\n"
    "fmla v15.4s, v27.4s, v4.4s\n"
    "fmla v19.4s, v27.4s, v5.4s\n"
    "fmla v21.4s, v27.4s, v6.4s\n"
    "str s3, [%[outptr0], x27]\n"
    "fmla v18.4s, v20.4s, v4.4s\n"
    "str s15, [x25, %[output_col_stride1]]\n"
    "fmla v22.4s, v20.4s, v5.4s\n"
    "fmla v19.4s, v20.4s, v7.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "str s18, [x24, x26]\n"
    "fmla v21.4s, v20.4s, v8.4s\n"
    "fmla v17.4s, v23.4s, v4.4s\n"
    "fmla v22.4s, v23.4s, v7.4s\n"
    "fmla v19.4s, v24.4s, v4.4s\n"
    "fmla v21.4s, v23.4s, v10.4s\n"
    "str s17, [x23, x27]\n"
    "fmla v22.4s, v16.4s, v4.4s\n"
    "str s19, [x25, x26]\n"
    "add x23, x23, #4\n"
    "fmla v21.4s, v24.4s, v5.4s\n"
    "str s22, [x24, x27]\n"
    "add x24, x24, #4\n"
    "fmla v21.4s, v16.4s, v7.4s\n"
    "fmla v21.4s, v13.4s, v4.4s\n"
    "str s21, [x25, x27]\n"
    "add x25, x25, #4\n"
    "7:\n"
    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels)
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
  );
}

template <>
template <>
void Conv::execute_tile<ActivationFunction::None>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *inptrs[6][6],
  float *outptrs[4][4]
)
{
  __asm __volatile(
    "mov x27, xzr\n"
    "mov x28, xzr\n"
    "and x15, %[n_channels], #3\n"
    "lsr x16, %[n_channels], #2\n"
    "cbz x16, 4f\n"
    "1:\n"
    "ldr q13, [%[wbptr]]\n"
    "ldr x17, [%[inptrs], 0]\n"
    "mov v18.16b, v13.16b\n"
    "ldr q12, [%[wbptr], #16]\n"
    "mov v22.16b, v13.16b\n"
    "ldr q11, [%[wbptr], #32]\n"
    "mov v23.16b, v13.16b\n"
    "ldr q10, [%[wbptr], #48]\n"
    "mov v19.16b, v13.16b\n"
    "ldr q9, [%[wbptr], #64]\n"
    "mov v17.16b, v13.16b\n"
    "ldr q8, [%[wbptr], #80]\n"
    "mov v14.16b, v13.16b\n"
    "ldr q7, [%[wbptr], #96]\n"
    "mov v0.16b, v13.16b\n"
    "ldr q6, [%[wbptr], #112]\n"
    "mov v1.16b, v13.16b\n"
    "ldr q5, [%[wbptr], #128]\n"
    "mov v2.16b, v13.16b\n"
    "ldr q4, [%[wbptr], #144]\n"
    "ldr q29, [x17, x27]\n"
    "ldr x7, [%[inptrs], 48]\n"
    "fmla v18.4s, v29.4s, v12.4s\n"
    "ldr x17, [%[inptrs], 8]\n"
    "ldr q27, [x7, x27]\n"
    "ldr x19, [%[inptrs], 96]\n"
    "ldr q28, [x17, x27]\n"
    "ldr x7, [%[inptrs], 56]\n"
    "ldr q25, [x19, x27]\n"
    "ldr x17, [%[inptrs], 16]\n"
    "ldr q16, [x7, x27]\n"
    "ldr x20, [%[inptrs], 144]\n"
    "ldr q15, [x17, x27]\n"
    "ldr x19, [%[inptrs], 104]\n"
    "ldr q21, [x20, x27]\n"
    "subs x16, x16, #1\n"
    "ldr q29, [x19, x27]\n"
    "beq 3f\n"
    "2:\n"
    "mov v3.16b, v13.16b\n"
    "ldr x7, [%[inptrs], 64]\n"
    "fmla v18.4s, v27.4s, v9.4s\n"
    "ldr x17, [%[inptrs], 24]\n"
    "fmla v22.4s, v27.4s, v12.4s\n"
    "ldr q30, [x7, x27]\n"
    "fmla v23.4s, v28.4s, v12.4s\n"
    "ldr x21, [%[inptrs], 192]\n"
    "fmla v19.4s, v25.4s, v12.4s\n"
    "ldr x20, [%[inptrs], 152]\n"
    "fmla v18.4s, v28.4s, v11.4s\n"
    "ldr q24, [x17, x27]\n"
    "fmla v22.4s, v25.4s, v9.4s\n"
    "ldr x19, [%[inptrs], 112]\n"
    "fmla v23.4s, v16.4s, v9.4s\n"
    "ldr x7, [%[inptrs], 72]\n"
    "fmla v17.4s, v16.4s, v12.4s\n"
    "ldr x17, [%[inptrs], 32]\n"
    "fmla v18.4s, v25.4s, v6.4s\n"
    "ldr q31, [x21, x27]\n"
    "fmla v22.4s, v16.4s, v11.4s\n"
    "ldr x22, [%[inptrs], 240]\n"
    "fmla v23.4s, v15.4s, v11.4s\n"
    "ldr x21, [%[inptrs], 200]\n"
    "fmla v14.4s, v15.4s, v12.4s\n"
    "ldr x23, [%[outptrs], 0]\n"
    "fmla v18.4s, v16.4s, v8.4s\n"
    "ldr q25, [x20, x27]\n"
    "fmla v22.4s, v21.4s, v6.4s\n"
    "ldr x20, [%[inptrs], 160]\n"
    "fmla v19.4s, v21.4s, v9.4s\n"
    "ldr x24, [%[outptrs], 32]\n"
    "fmla v0.4s, v21.4s, v12.4s\n"
    "ldr q21, [x19, x27]\n"
    "fmla v18.4s, v15.4s, v10.4s\n"
    "ldr q20, [x7, x27]\n"
    "fmla v22.4s, v29.4s, v8.4s\n"
    "ldr x19, [%[inptrs], 120]\n"
    "fmla v23.4s, v29.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 80]\n"
    "fmla v19.4s, v29.4s, v11.4s\n"
    "ldr x25, [%[outptrs], 64]\n"
    "fmla v18.4s, v29.4s, v5.4s\n"
    "ldr x26, [%[outptrs], 96]\n"
    "fmla v17.4s, v29.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v1.4s, v29.4s, v12.4s\n"
    "ldr q26, [x17, x27]\n"
    "fmla v22.4s, v30.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v18.4s, v30.4s, v7.4s\n"
    "ldr x17, [%[inptrs], 40]\n"
    "fmla v23.4s, v30.4s, v8.4s\n"
    "subs x16, x16, #1\n"
    "fmla v17.4s, v30.4s, v11.4s\n"
    "fmla v14.4s, v30.4s, v9.4s\n"
    "fmla v2.4s, v30.4s, v12.4s\n"
    "ldr q27, [x22, x27]\n"
    "fmla v3.4s, v24.4s, v12.4s\n"
    "ldr x22, [%[inptrs], 248]\n"
    "fmla v23.4s, v24.4s, v10.4s\n"
    "fmla v19.4s, v31.4s, v6.4s\n"
    "fmla v14.4s, v24.4s, v11.4s\n"
    "ldr q30, [x21, x27]\n"
    "fmla v0.4s, v31.4s, v9.4s\n"
    "ldr q24, [x20, x27]\n"
    "fmla v22.4s, v25.4s, v5.4s\n"
    "ldr x21, [%[inptrs], 208]\n"
    "fmla v19.4s, v25.4s, v8.4s\n"
    "ldr x20, [%[inptrs], 168]\n"
    "fmla v17.4s, v25.4s, v6.4s\n"
    "fmla v1.4s, v25.4s, v9.4s\n"
    "fmla v0.4s, v25.4s, v11.4s\n"
    "fmla v18.4s, v21.4s, v4.4s\n"
    "fmla v22.4s, v21.4s, v7.4s\n"
    "fmla v23.4s, v21.4s, v5.4s\n"
    "fmla v19.4s, v21.4s, v10.4s\n"
    "fmla v14.4s, v21.4s, v6.4s\n"
    "fmla v17.4s, v21.4s, v8.4s\n"
    "fmla v1.4s, v21.4s, v11.4s\n"
    "str q18, [x23, x28]\n"
    "mov v16.16b, v13.16b\n"
    "fmla v2.4s, v21.4s, v9.4s\n"
    "ldr x23, [%[outptrs], 8]\n"
    "fmla v23.4s, v20.4s, v7.4s\n"
    "fmla v14.4s, v20.4s, v8.4s\n"
    "fmla v16.4s, v25.4s, v12.4s\n"
    "ldr q25, [x19, x27]\n"
    "fmla v17.4s, v20.4s, v10.4s\n"
    "ldr x19, [%[inptrs], 128]\n"
    "fmla v2.4s, v20.4s, v11.4s\n"
    "fmla v3.4s, v20.4s, v9.4s\n"
    "fmla v14.4s, v26.4s, v10.4s\n"
    "fmla v0.4s, v27.4s, v6.4s\n"
    "mov v15.16b, v13.16b\n"
    "fmla v19.4s, v30.4s, v5.4s\n"
    "fmla v1.4s, v30.4s, v6.4s\n"
    "fmla v16.4s, v30.4s, v9.4s\n"
    "fmla v3.4s, v26.4s, v11.4s\n"
    "ldr q29, [x7, x27]\n"
    "fmla v15.4s, v21.4s, v12.4s\n"
    "ldr q27, [x17, x27]\n"
    "fmla v0.4s, v30.4s, v8.4s\n"
    "ldr q28, [x22, x27]\n"
    "fmla v22.4s, v24.4s, v4.4s\n"
    "ldr x7, [%[inptrs], 88]\n"
    "fmla v19.4s, v24.4s, v7.4s\n"
    "ldr x22, [%[inptrs], 256]\n"
    "fmla v17.4s, v24.4s, v5.4s\n"
    "ldr x17, [%[inptrs], 0]\n"
    "fmla v0.4s, v24.4s, v10.4s\n"
    "fmla v1.4s, v24.4s, v8.4s\n"
    "str q22, [x24, x28]\n"
    "mov v18.16b, v13.16b\n"
    "fmla v2.4s, v24.4s, v6.4s\n"
    "ldr x24, [%[outptrs], 40]\n"
    "fmla v16.4s, v24.4s, v11.4s\n"
    "fmla v15.4s, v24.4s, v9.4s\n"
    "fmla v18.4s, v20.4s, v12.4s\n"
    "ldr q22, [x21, x27]\n"
    "fmla v23.4s, v25.4s, v4.4s\n"
    "ldr x21, [%[inptrs], 216]\n"
    "fmla v17.4s, v25.4s, v7.4s\n"
    "fmla v14.4s, v25.4s, v5.4s\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "fmla v2.4s, v25.4s, v8.4s\n"
    "fmla v3.4s, v25.4s, v6.4s\n"
    "fmla v15.4s, v25.4s, v11.4s\n"
    "str q23, [x23, x28]\n"
    "mov v21.16b, v13.16b\n"
    "fmla v18.4s, v25.4s, v9.4s\n"
    "ldr x23, [%[outptrs], 16]\n"
    "fmla v14.4s, v29.4s, v7.4s\n"
    "fmla v2.4s, v29.4s, v10.4s\n"
    "fmla v21.4s, v24.4s, v12.4s\n"
    "ldr q30, [x20, x27]\n"
    "fmla v3.4s, v29.4s, v8.4s\n"
    "ldr x20, [%[inptrs], 176]\n"
    "fmla v18.4s, v29.4s, v11.4s\n"
    "ldr q31, [x19, x27]\n"
    "fmla v0.4s, v28.4s, v5.4s\n"
    "ldr x19, [%[inptrs], 136]\n"
    "fmla v16.4s, v28.4s, v6.4s\n"
    "ldr q26, [x7, x27]\n"
    "fmla v3.4s, v27.4s, v10.4s\n"
    "ldr q23, [x22, x27]\n"
    "fmla v19.4s, v22.4s, v4.4s\n"
    "ldr x22, [%[inptrs], 264]\n"
    "fmla v0.4s, v22.4s, v7.4s\n"
    "ldr x7, [%[inptrs], 48]\n"
    "fmla v1.4s, v22.4s, v5.4s\n"
    "fmla v16.4s, v22.4s, v8.4s\n"
    "fmla v15.4s, v22.4s, v6.4s\n"
    "fmla v21.4s, v22.4s, v9.4s\n"
    "str q19, [x25, x28]\n"
    "mov v24.16b, v13.16b\n"
    "mov v20.16b, v13.16b\n"
    "ldr q27, [x21, x27]\n"
    "fmla v17.4s, v30.4s, v4.4s\n"
    "ldr x21, [%[inptrs], 224]\n"
    "fmla v24.4s, v25.4s, v12.4s\n"
    "ldr q28, [x20, x27]\n"
    "fmla v1.4s, v30.4s, v7.4s\n"
    "ldr x20, [%[inptrs], 184]\n"
    "fmla v2.4s, v30.4s, v5.4s\n"
    "ldr x25, [%[outptrs], 72]\n"
    "str q17, [x24, x28]\n"
    "fmla v16.4s, v30.4s, v10.4s\n"
    "fmla v15.4s, v30.4s, v8.4s\n"
    "ldr q22, [x19, x27]\n"
    "fmla v18.4s, v30.4s, v6.4s\n"
    "ldr x24, [%[outptrs], 48]\n"
    "fmla v21.4s, v30.4s, v11.4s\n"
    "ldr x19, [%[inptrs], 96]\n"
    "fmla v24.4s, v30.4s, v9.4s\n"
    "fmla v20.4s, v30.4s, v12.4s\n"
    "fmla v14.4s, v31.4s, v4.4s\n"
    "ldr q30, [x22, x27]\n"
    "fmla v2.4s, v31.4s, v7.4s\n"
    "ldr q19, [x21, x27]\n"
    "fmla v3.4s, v31.4s, v5.4s\n"
    "ldr x22, [%[inptrs], 272]\n"
    "fmla v15.4s, v31.4s, v10.4s\n"
    "ldr x21, [%[inptrs], 232]\n"
    "str q14, [x23, x28]\n"
    "fmla v18.4s, v31.4s, v8.4s\n"
    "fmla v24.4s, v31.4s, v11.4s\n"
    "ldr q31, [x20, x27]\n"
    "fmla v3.4s, v26.4s, v7.4s\n"
    "ldr q17, [x22, x27]\n"
    "fmla v0.4s, v23.4s, v4.4s\n"
    "ldr x22, [%[inptrs], 280]\n"
    "fmla v18.4s, v26.4s, v10.4s\n"
    "ldr q14, [x21, x27]\n"
    "fmla v16.4s, v23.4s, v5.4s\n"
    "ldr x23, [%[outptrs], 24]\n"
    "fmla v21.4s, v23.4s, v6.4s\n"
    "ldr q26, [x22, x27]\n"
    "str q0, [x26, x28]\n"
    "fmla v1.4s, v27.4s, v4.4s\n"
    "fmla v15.4s, v27.4s, v5.4s\n"
    "ldr q13, [%[wbptr]]\n"
    "fmla v16.4s, v27.4s, v7.4s\n"
    "ldr x26, [%[outptrs], 104]\n"
    "fmla v21.4s, v27.4s, v8.4s\n"
    "add x27, x27, #16\n"
    "str q1, [x25, x28]\n"
    "fmla v24.4s, v27.4s, v6.4s\n"
    "fmla v20.4s, v27.4s, v9.4s\n"
    "ldr q12, [%[wbptr], #16]\n"
    "fmla v2.4s, v28.4s, v4.4s\n"
    "ldr q29, [x17, x27]\n"
    "fmla v15.4s, v28.4s, v7.4s\n"
    "ldr q27, [x7, x27]\n"
    "fmla v18.4s, v28.4s, v5.4s\n"
    "ldr x25, [%[outptrs], 80]\n"
    "fmla v21.4s, v28.4s, v10.4s\n"
    "ldr x17, [%[inptrs], 8]\n"
    "str q2, [x24, x28]\n"
    "fmla v24.4s, v28.4s, v8.4s\n"
    "fmla v20.4s, v28.4s, v11.4s\n"
    "ldr q9, [%[wbptr], #64]\n"
    "fmla v3.4s, v22.4s, v4.4s\n"
    "ldr q28, [x17, x27]\n"
    "fmla v18.4s, v22.4s, v7.4s\n"
    "ldr q25, [x19, x27]\n"
    "fmla v24.4s, v22.4s, v10.4s\n"
    "ldr x24, [%[outptrs], 56]\n"
    "fmla v16.4s, v30.4s, v4.4s\n"
    "ldr q11, [%[wbptr], #32]\n"
    "str q3, [x23, x28]\n"
    "fmla v21.4s, v30.4s, v5.4s\n"
    "fmla v20.4s, v30.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 56]\n"
    "fmla v15.4s, v19.4s, v4.4s\n"
    "ldr x17, [%[inptrs], 16]\n"
    "str q16, [x26, x28]\n"
    "fmla v24.4s, v19.4s, v5.4s\n"
    "fmla v21.4s, v19.4s, v7.4s\n"
    "ldr q16, [x7, x27]\n"
    "fmla v20.4s, v19.4s, v8.4s\n"
    "ldr q6, [%[wbptr], #112]\n"
    "str q15, [x25, x28]\n"
    "fmla v18.4s, v31.4s, v4.4s\n"
    "fmla v24.4s, v31.4s, v7.4s\n"
    "ldr q15, [x17, x27]\n"
    "fmla v21.4s, v17.4s, v4.4s\n"
    "ldr x25, [%[outptrs], 88]\n"
    "fmla v20.4s, v31.4s, v10.4s\n"
    "ldr q8, [%[wbptr], #80]\n"
    "str q18, [x24, x28]\n"
    "mov v18.16b, v13.16b\n"
    "fmla v24.4s, v14.4s, v4.4s\n"
    "ldr x26, [%[outptrs], 112]\n"
    "mov v22.16b, v13.16b\n"
    "ldr x20, [%[inptrs], 144]\n"
    "str q21, [x26, x28]\n"
    "fmla v20.4s, v17.4s, v5.4s\n"
    "mov v23.16b, v13.16b\n"
    "ldr q10, [%[wbptr], #48]\n"
    "str q24, [x25, x28]\n"
    "mov v19.16b, v13.16b\n"
    "mov v17.16b, v13.16b\n"
    "ldr q21, [x20, x27]\n"
    "fmla v20.4s, v14.4s, v7.4s\n"
    "ldr q5, [%[wbptr], #128]\n"
    "mov v14.16b, v13.16b\n"
    "ldr x26, [%[outptrs], 120]\n"
    "mov v0.16b, v13.16b\n"
    "ldr x19, [%[inptrs], 104]\n"
    "mov v1.16b, v13.16b\n"
    "mov v2.16b, v13.16b\n"
    "fmla v20.4s, v26.4s, v4.4s\n"
    "ldr q7, [%[wbptr], #96]\n"
    "fmla v18.4s, v29.4s, v12.4s\n"
    "ldr q29, [x19, x27]\n"
    "str q20, [x26, x28]\n"
    "ldr q4, [%[wbptr], #144]\n"
    "add x28, x28, #16\n"
    "bne 2b\n"
    "3:\n"
    "mov v3.16b, v13.16b\n"
    "ldr x7, [%[inptrs], 64]\n"
    "fmla v18.4s, v27.4s, v9.4s\n"
    "ldr x17, [%[inptrs], 24]\n"
    "fmla v22.4s, v27.4s, v12.4s\n"
    "ldr q30, [x7, x27]\n"
    "fmla v23.4s, v28.4s, v12.4s\n"
    "ldr x21, [%[inptrs], 192]\n"
    "fmla v19.4s, v25.4s, v12.4s\n"
    "ldr x20, [%[inptrs], 152]\n"
    "fmla v18.4s, v28.4s, v11.4s\n"
    "ldr q24, [x17, x27]\n"
    "fmla v22.4s, v25.4s, v9.4s\n"
    "ldr x19, [%[inptrs], 112]\n"
    "fmla v23.4s, v16.4s, v9.4s\n"
    "ldr x7, [%[inptrs], 72]\n"
    "fmla v17.4s, v16.4s, v12.4s\n"
    "ldr x17, [%[inptrs], 32]\n"
    "fmla v18.4s, v25.4s, v6.4s\n"
    "ldr q31, [x21, x27]\n"
    "fmla v22.4s, v16.4s, v11.4s\n"
    "ldr x22, [%[inptrs], 240]\n"
    "fmla v23.4s, v15.4s, v11.4s\n"
    "ldr x21, [%[inptrs], 200]\n"
    "fmla v14.4s, v15.4s, v12.4s\n"
    "ldr x23, [%[outptrs], 0]\n"
    "fmla v18.4s, v16.4s, v8.4s\n"
    "ldr q25, [x20, x27]\n"
    "fmla v22.4s, v21.4s, v6.4s\n"
    "ldr x20, [%[inptrs], 160]\n"
    "fmla v19.4s, v21.4s, v9.4s\n"
    "ldr x24, [%[outptrs], 32]\n"
    "fmla v0.4s, v21.4s, v12.4s\n"
    "ldr q21, [x19, x27]\n"
    "fmla v18.4s, v15.4s, v10.4s\n"
    "ldr q20, [x7, x27]\n"
    "fmla v22.4s, v29.4s, v8.4s\n"
    "ldr x19, [%[inptrs], 120]\n"
    "fmla v23.4s, v29.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 80]\n"
    "fmla v19.4s, v29.4s, v11.4s\n"
    "ldr x25, [%[outptrs], 64]\n"
    "fmla v18.4s, v29.4s, v5.4s\n"
    "ldr x26, [%[outptrs], 96]\n"
    "fmla v17.4s, v29.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v1.4s, v29.4s, v12.4s\n"
    "ldr q26, [x17, x27]\n"
    "fmla v22.4s, v30.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v18.4s, v30.4s, v7.4s\n"
    "ldr x17, [%[inptrs], 40]\n"
    "fmla v23.4s, v30.4s, v8.4s\n"
    "fmla v17.4s, v30.4s, v11.4s\n"
    "fmla v14.4s, v30.4s, v9.4s\n"
    "fmla v2.4s, v30.4s, v12.4s\n"
    "mov v16.16b, v13.16b\n"
    "fmla v3.4s, v24.4s, v12.4s\n"
    "fmla v19.4s, v31.4s, v6.4s\n"
    "fmla v0.4s, v31.4s, v9.4s\n"
    "mov v15.16b, v13.16b\n"
    "fmla v23.4s, v24.4s, v10.4s\n"
    "fmla v14.4s, v24.4s, v11.4s\n"
    "ldr q27, [x22, x27]\n"
    "fmla v22.4s, v25.4s, v5.4s\n"
    "ldr x22, [%[inptrs], 248]\n"
    "fmla v19.4s, v25.4s, v8.4s\n"
    "fmla v17.4s, v25.4s, v6.4s\n"
    "fmla v0.4s, v25.4s, v11.4s\n"
    "fmla v1.4s, v25.4s, v9.4s\n"
    "fmla v16.4s, v25.4s, v12.4s\n"
    "ldr q30, [x21, x27]\n"
    "fmla v18.4s, v21.4s, v4.4s\n"
    "ldr x21, [%[inptrs], 208]\n"
    "fmla v22.4s, v21.4s, v7.4s\n"
    "fmla v23.4s, v21.4s, v5.4s\n"
    "fmla v19.4s, v21.4s, v10.4s\n"
    "fmla v17.4s, v21.4s, v8.4s\n"
    "fmla v14.4s, v21.4s, v6.4s\n"
    "fmla v1.4s, v21.4s, v11.4s\n"
    "str q18, [x23, x28]\n"
    "mov v18.16b, v13.16b\n"
    "fmla v2.4s, v21.4s, v9.4s\n"
    "ldr x23, [%[outptrs], 8]\n"
    "fmla v15.4s, v21.4s, v12.4s\n"
    "ldr q24, [x20, x27]\n"
    "fmla v23.4s, v20.4s, v7.4s\n"
    "ldr x20, [%[inptrs], 168]\n"
    "fmla v17.4s, v20.4s, v10.4s\n"
    "fmla v14.4s, v20.4s, v8.4s\n"
    "fmla v2.4s, v20.4s, v11.4s\n"
    "fmla v3.4s, v20.4s, v9.4s\n"
    "fmla v18.4s, v20.4s, v12.4s\n"
    "ldr q25, [x19, x27]\n"
    "fmla v0.4s, v27.4s, v6.4s\n"
    "ldr q29, [x7, x27]\n"
    "fmla v14.4s, v26.4s, v10.4s\n"
    "ldr x19, [%[inptrs], 128]\n"
    "fmla v3.4s, v26.4s, v11.4s\n"
    "ldr q27, [x17, x27]\n"
    "fmla v19.4s, v30.4s, v5.4s\n"
    "ldr x7, [%[inptrs], 88]\n"
    "fmla v0.4s, v30.4s, v8.4s\n"
    "fmla v1.4s, v30.4s, v6.4s\n"
    "fmla v16.4s, v30.4s, v9.4s\n"
    "ldr q28, [x22, x27]\n"
    "fmla v22.4s, v24.4s, v4.4s\n"
    "ldr x22, [%[inptrs], 256]\n"
    "fmla v19.4s, v24.4s, v7.4s\n"
    "fmla v17.4s, v24.4s, v5.4s\n"
    "fmla v0.4s, v24.4s, v10.4s\n"
    "fmla v1.4s, v24.4s, v8.4s\n"
    "fmla v2.4s, v24.4s, v6.4s\n"
    "fmla v16.4s, v24.4s, v11.4s\n"
    "str q22, [x24, x28]\n"
    "mov v21.16b, v13.16b\n"
    "fmla v15.4s, v24.4s, v9.4s\n"
    "ldr x24, [%[outptrs], 40]\n"
    "fmla v23.4s, v25.4s, v4.4s\n"
    "fmla v17.4s, v25.4s, v7.4s\n"
    "fmla v21.4s, v24.4s, v12.4s\n"
    "ldr q22, [x21, x27]\n"
    "fmla v14.4s, v25.4s, v5.4s\n"
    "ldr x21, [%[inptrs], 216]\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "fmla v2.4s, v25.4s, v8.4s\n"
    "str q23, [x23, x28]\n"
    "mov v24.16b, v13.16b\n"
    "mov v20.16b, v13.16b\n"
    "ldr x23, [%[outptrs], 16]\n"
    "fmla v3.4s, v25.4s, v6.4s\n"
    "fmla v15.4s, v25.4s, v11.4s\n"
    "fmla v18.4s, v25.4s, v9.4s\n"
    "fmla v24.4s, v25.4s, v12.4s\n"
    "fmla v14.4s, v29.4s, v7.4s\n"
    "ldr q30, [x20, x27]\n"
    "fmla v2.4s, v29.4s, v10.4s\n"
    "ldr x20, [%[inptrs], 176]\n"
    "fmla v3.4s, v29.4s, v8.4s\n"
    "fmla v0.4s, v28.4s, v5.4s\n"
    "fmla v18.4s, v29.4s, v11.4s\n"
    "ldr q31, [x19, x27]\n"
    "fmla v16.4s, v28.4s, v6.4s\n"
    "ldr q26, [x7, x27]\n"
    "fmla v19.4s, v22.4s, v4.4s\n"
    "ldr x19, [%[inptrs], 136]\n"
    "fmla v3.4s, v27.4s, v10.4s\n"
    "ldr q23, [x22, x27]\n"
    "fmla v0.4s, v22.4s, v7.4s\n"
    "ldr x22, [%[inptrs], 264]\n"
    "fmla v1.4s, v22.4s, v5.4s\n"
    "fmla v16.4s, v22.4s, v8.4s\n"
    "str q19, [x25, x28]\n"
    "fmla v15.4s, v22.4s, v6.4s\n"
    "fmla v21.4s, v22.4s, v9.4s\n"
    "ldr q27, [x21, x27]\n"
    "fmla v17.4s, v30.4s, v4.4s\n"
    "ldr q28, [x20, x27]\n"
    "fmla v1.4s, v30.4s, v7.4s\n"
    "ldr x21, [%[inptrs], 224]\n"
    "fmla v2.4s, v30.4s, v5.4s\n"
    "ldr x20, [%[inptrs], 184]\n"
    "fmla v16.4s, v30.4s, v10.4s\n"
    "ldr x25, [%[outptrs], 72]\n"
    "str q17, [x24, x28]\n"
    "fmla v15.4s, v30.4s, v8.4s\n"
    "fmla v18.4s, v30.4s, v6.4s\n"
    "ldr q22, [x19, x27]\n"
    "fmla v21.4s, v30.4s, v11.4s\n"
    "ldr x24, [%[outptrs], 48]\n"
    "fmla v24.4s, v30.4s, v9.4s\n"
    "fmla v20.4s, v30.4s, v12.4s\n"
    "fmla v14.4s, v31.4s, v4.4s\n"
    "ldr q30, [x22, x27]\n"
    "fmla v2.4s, v31.4s, v7.4s\n"
    "ldr q19, [x21, x27]\n"
    "fmla v3.4s, v31.4s, v5.4s\n"
    "ldr x22, [%[inptrs], 272]\n"
    "fmla v15.4s, v31.4s, v10.4s\n"
    "ldr x21, [%[inptrs], 232]\n"
    "str q14, [x23, x28]\n"
    "fmla v18.4s, v31.4s, v8.4s\n"
    "fmla v24.4s, v31.4s, v11.4s\n"
    "ldr q31, [x20, x27]\n"
    "fmla v3.4s, v26.4s, v7.4s\n"
    "ldr q17, [x22, x27]\n"
    "fmla v0.4s, v23.4s, v4.4s\n"
    "ldr x22, [%[inptrs], 280]\n"
    "fmla v18.4s, v26.4s, v10.4s\n"
    "ldr q14, [x21, x27]\n"
    "fmla v16.4s, v23.4s, v5.4s\n"
    "ldr x23, [%[outptrs], 24]\n"
    "fmla v21.4s, v23.4s, v6.4s\n"
    "ldr q26, [x22, x27]\n"
    "str q0, [x26, x28]\n"
    "fmla v1.4s, v27.4s, v4.4s\n"
    "fmla v15.4s, v27.4s, v5.4s\n"
    "ldr x26, [%[outptrs], 104]\n"
    "fmla v16.4s, v27.4s, v7.4s\n"
    "add x27, x27, #16\n"
    "fmla v21.4s, v27.4s, v8.4s\n"
    "fmla v24.4s, v27.4s, v6.4s\n"
    "str q1, [x25, x28]\n"
    "fmla v20.4s, v27.4s, v9.4s\n"
    "fmla v2.4s, v28.4s, v4.4s\n"
    "ldr x25, [%[outptrs], 80]\n"
    "fmla v15.4s, v28.4s, v7.4s\n"
    "fmla v18.4s, v28.4s, v5.4s\n"
    "fmla v21.4s, v28.4s, v10.4s\n"
    "fmla v24.4s, v28.4s, v8.4s\n"
    "fmla v20.4s, v28.4s, v11.4s\n"
    "fmla v3.4s, v22.4s, v4.4s\n"
    "str q2, [x24, x28]\n"
    "fmla v16.4s, v30.4s, v4.4s\n"
    "fmla v18.4s, v22.4s, v7.4s\n"
    "ldr x24, [%[outptrs], 56]\n"
    "fmla v24.4s, v22.4s, v10.4s\n"
    "fmla v21.4s, v30.4s, v5.4s\n"
    "str q3, [x23, x28]\n"
    "fmla v20.4s, v30.4s, v6.4s\n"
    "str q16, [x26, x28]\n"
    "fmla v15.4s, v19.4s, v4.4s\n"
    "fmla v18.4s, v31.4s, v4.4s\n"
    "ldr x26, [%[outptrs], 112]\n"
    "fmla v21.4s, v19.4s, v7.4s\n"
    "fmla v24.4s, v19.4s, v5.4s\n"
    "fmla v20.4s, v19.4s, v8.4s\n"
    "str q15, [x25, x28]\n"
    "str q18, [x24, x28]\n"
    "ldr x25, [%[outptrs], 88]\n"
    "fmla v24.4s, v31.4s, v7.4s\n"
    "fmla v21.4s, v17.4s, v4.4s\n"
    "fmla v20.4s, v31.4s, v10.4s\n"
    "str q21, [x26, x28]\n"
    "fmla v20.4s, v17.4s, v5.4s\n"
    "ldr x26, [%[outptrs], 120]\n"
    "fmla v24.4s, v14.4s, v4.4s\n"
    "fmla v20.4s, v14.4s, v7.4s\n"
    "str q24, [x25, x28]\n"
    "fmla v20.4s, v26.4s, v4.4s\n"
    "str q20, [x26, x28]\n"
    "add x28, x28, #16\n"
    "4:\n"
    "cbz x15, 7f\n"
    "ldr s13, [%[wbptr]]\n"
    "mov v18.16b, v13.16b\n"
    "ldr s12, [%[wbptr], #4]\n"
    "mov v22.16b, v13.16b\n"
    "ldr s11, [%[wbptr], #8]\n"
    "mov v23.16b, v13.16b\n"
    "ldr s10, [%[wbptr], #12]\n"
    "mov v19.16b, v13.16b\n"
    "ldr s9, [%[wbptr], #16]\n"
    "mov v17.16b, v13.16b\n"
    "ldr s8, [%[wbptr], #20]\n"
    "mov v14.16b, v13.16b\n"
    "ldr s7, [%[wbptr], #24]\n"
    "mov v0.16b, v13.16b\n"
    "ldr s6, [%[wbptr], #28]\n"
    "mov v1.16b, v13.16b\n"
    "ldr s5, [%[wbptr], #32]\n"
    "mov v2.16b, v13.16b\n"
    "ldr s4, [%[wbptr], #36]\n"
    "ldr x17, [%[inptrs], 0]\n"
    "ldr x7, [%[inptrs], 48]\n"
    "ldr x19, [%[inptrs], 96]\n"
    "ldr x20, [%[inptrs], 144]\n"
    "subs x15, x15, #1\n"
    "ldr s29, [x17, x27]\n"
    "fmla v18.4s, v29.4s, v12.4s\n"
    "ldr s27, [x7, x27]\n"
    "ldr s25, [x19, x27]\n"
    "ldr x17, [%[inptrs], 8]\n"
    "ldr s21, [x20, x27]\n"
    "ldr x7, [%[inptrs], 56]\n"
    "ldr s28, [x17, x27]\n"
    "ldr x19, [%[inptrs], 104]\n"
    "ldr s16, [x7, x27]\n"
    "ldr x17, [%[inptrs], 16]\n"
    "ldr s29, [x19, x27]\n"
    "ldr s15, [x17, x27]\n"
    "beq 6f\n"
    "5:\n"
    "mov v3.16b, v13.16b\n"
    "ldr x7, [%[inptrs], 64]\n"
    "fmla v18.4s, v27.4s, v9.4s\n"
    "ldr x17, [%[inptrs], 24]\n"
    "fmla v22.4s, v27.4s, v12.4s\n"
    "ldr s30, [x7, x27]\n"
    "fmla v23.4s, v28.4s, v12.4s\n"
    "ldr x21, [%[inptrs], 192]\n"
    "fmla v19.4s, v25.4s, v12.4s\n"
    "ldr x20, [%[inptrs], 152]\n"
    "fmla v18.4s, v28.4s, v11.4s\n"
    "ldr s24, [x17, x27]\n"
    "fmla v22.4s, v25.4s, v9.4s\n"
    "ldr x19, [%[inptrs], 112]\n"
    "fmla v23.4s, v16.4s, v9.4s\n"
    "ldr x7, [%[inptrs], 72]\n"
    "fmla v17.4s, v16.4s, v12.4s\n"
    "ldr x17, [%[inptrs], 32]\n"
    "fmla v18.4s, v25.4s, v6.4s\n"
    "ldr s31, [x21, x27]\n"
    "fmla v22.4s, v16.4s, v11.4s\n"
    "ldr x22, [%[inptrs], 240]\n"
    "fmla v23.4s, v15.4s, v11.4s\n"
    "ldr x21, [%[inptrs], 200]\n"
    "fmla v14.4s, v15.4s, v12.4s\n"
    "ldr x23, [%[outptrs], 0]\n"
    "fmla v18.4s, v16.4s, v8.4s\n"
    "ldr s25, [x20, x27]\n"
    "fmla v22.4s, v21.4s, v6.4s\n"
    "ldr x20, [%[inptrs], 160]\n"
    "fmla v19.4s, v21.4s, v9.4s\n"
    "ldr x24, [%[outptrs], 32]\n"
    "fmla v0.4s, v21.4s, v12.4s\n"
    "ldr s21, [x19, x27]\n"
    "fmla v18.4s, v15.4s, v10.4s\n"
    "ldr s20, [x7, x27]\n"
    "fmla v22.4s, v29.4s, v8.4s\n"
    "ldr x19, [%[inptrs], 120]\n"
    "fmla v23.4s, v29.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 80]\n"
    "fmla v19.4s, v29.4s, v11.4s\n"
    "ldr x25, [%[outptrs], 64]\n"
    "fmla v18.4s, v29.4s, v5.4s\n"
    "ldr x26, [%[outptrs], 96]\n"
    "fmla v17.4s, v29.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v1.4s, v29.4s, v12.4s\n"
    "ldr s26, [x17, x27]\n"
    "fmla v22.4s, v30.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v18.4s, v30.4s, v7.4s\n"
    "ldr x17, [%[inptrs], 40]\n"
    "fmla v23.4s, v30.4s, v8.4s\n"
    "subs x15, x15, #1\n"
    "fmla v17.4s, v30.4s, v11.4s\n"
    "fmla v14.4s, v30.4s, v9.4s\n"
    "fmla v2.4s, v30.4s, v12.4s\n"
    "ldr s27, [x22, x27]\n"
    "fmla v3.4s, v24.4s, v12.4s\n"
    "ldr x22, [%[inptrs], 248]\n"
    "fmla v23.4s, v24.4s, v10.4s\n"
    "fmla v19.4s, v31.4s, v6.4s\n"
    "fmla v14.4s, v24.4s, v11.4s\n"
    "ldr s30, [x21, x27]\n"
    "fmla v0.4s, v31.4s, v9.4s\n"
    "ldr s24, [x20, x27]\n"
    "fmla v22.4s, v25.4s, v5.4s\n"
    "ldr x21, [%[inptrs], 208]\n"
    "fmla v19.4s, v25.4s, v8.4s\n"
    "ldr x20, [%[inptrs], 168]\n"
    "fmla v17.4s, v25.4s, v6.4s\n"
    "fmla v1.4s, v25.4s, v9.4s\n"
    "fmla v0.4s, v25.4s, v11.4s\n"
    "fmla v18.4s, v21.4s, v4.4s\n"
    "fmla v22.4s, v21.4s, v7.4s\n"
    "fmla v23.4s, v21.4s, v5.4s\n"
    "fmla v19.4s, v21.4s, v10.4s\n"
    "fmla v14.4s, v21.4s, v6.4s\n"
    "fmla v17.4s, v21.4s, v8.4s\n"
    "fmla v1.4s, v21.4s, v11.4s\n"
    "str s18, [x23, x28]\n"
    "mov v16.16b, v13.16b\n"
    "fmla v2.4s, v21.4s, v9.4s\n"
    "ldr x23, [%[outptrs], 8]\n"
    "fmla v23.4s, v20.4s, v7.4s\n"
    "fmla v14.4s, v20.4s, v8.4s\n"
    "fmla v16.4s, v25.4s, v12.4s\n"
    "ldr s25, [x19, x27]\n"
    "fmla v17.4s, v20.4s, v10.4s\n"
    "ldr x19, [%[inptrs], 128]\n"
    "fmla v2.4s, v20.4s, v11.4s\n"
    "fmla v3.4s, v20.4s, v9.4s\n"
    "fmla v14.4s, v26.4s, v10.4s\n"
    "fmla v0.4s, v27.4s, v6.4s\n"
    "mov v15.16b, v13.16b\n"
    "fmla v19.4s, v30.4s, v5.4s\n"
    "fmla v1.4s, v30.4s, v6.4s\n"
    "fmla v16.4s, v30.4s, v9.4s\n"
    "fmla v3.4s, v26.4s, v11.4s\n"
    "ldr s29, [x7, x27]\n"
    "fmla v15.4s, v21.4s, v12.4s\n"
    "ldr s27, [x17, x27]\n"
    "fmla v0.4s, v30.4s, v8.4s\n"
    "ldr s28, [x22, x27]\n"
    "fmla v22.4s, v24.4s, v4.4s\n"
    "ldr x7, [%[inptrs], 88]\n"
    "fmla v19.4s, v24.4s, v7.4s\n"
    "ldr x22, [%[inptrs], 256]\n"
    "fmla v17.4s, v24.4s, v5.4s\n"
    "ldr x17, [%[inptrs], 0]\n"
    "fmla v0.4s, v24.4s, v10.4s\n"
    "fmla v1.4s, v24.4s, v8.4s\n"
    "str s22, [x24, x28]\n"
    "mov v18.16b, v13.16b\n"
    "fmla v2.4s, v24.4s, v6.4s\n"
    "ldr x24, [%[outptrs], 40]\n"
    "fmla v16.4s, v24.4s, v11.4s\n"
    "fmla v15.4s, v24.4s, v9.4s\n"
    "fmla v18.4s, v20.4s, v12.4s\n"
    "ldr s22, [x21, x27]\n"
    "fmla v23.4s, v25.4s, v4.4s\n"
    "ldr x21, [%[inptrs], 216]\n"
    "fmla v17.4s, v25.4s, v7.4s\n"
    "fmla v14.4s, v25.4s, v5.4s\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "fmla v2.4s, v25.4s, v8.4s\n"
    "fmla v3.4s, v25.4s, v6.4s\n"
    "fmla v15.4s, v25.4s, v11.4s\n"
    "str s23, [x23, x28]\n"
    "mov v21.16b, v13.16b\n"
    "fmla v18.4s, v25.4s, v9.4s\n"
    "ldr x23, [%[outptrs], 16]\n"
    "fmla v14.4s, v29.4s, v7.4s\n"
    "fmla v2.4s, v29.4s, v10.4s\n"
    "fmla v21.4s, v24.4s, v12.4s\n"
    "ldr s30, [x20, x27]\n"
    "fmla v3.4s, v29.4s, v8.4s\n"
    "ldr x20, [%[inptrs], 176]\n"
    "fmla v18.4s, v29.4s, v11.4s\n"
    "ldr s31, [x19, x27]\n"
    "fmla v0.4s, v28.4s, v5.4s\n"
    "ldr x19, [%[inptrs], 136]\n"
    "fmla v16.4s, v28.4s, v6.4s\n"
    "ldr s26, [x7, x27]\n"
    "fmla v3.4s, v27.4s, v10.4s\n"
    "ldr s23, [x22, x27]\n"
    "fmla v19.4s, v22.4s, v4.4s\n"
    "ldr x22, [%[inptrs], 264]\n"
    "fmla v0.4s, v22.4s, v7.4s\n"
    "ldr x7, [%[inptrs], 48]\n"
    "fmla v1.4s, v22.4s, v5.4s\n"
    "fmla v16.4s, v22.4s, v8.4s\n"
    "fmla v15.4s, v22.4s, v6.4s\n"
    "fmla v21.4s, v22.4s, v9.4s\n"
    "str s19, [x25, x28]\n"
    "mov v24.16b, v13.16b\n"
    "mov v20.16b, v13.16b\n"
    "ldr s27, [x21, x27]\n"
    "fmla v17.4s, v30.4s, v4.4s\n"
    "ldr x21, [%[inptrs], 224]\n"
    "fmla v24.4s, v25.4s, v12.4s\n"
    "ldr s28, [x20, x27]\n"
    "fmla v1.4s, v30.4s, v7.4s\n"
    "ldr x20, [%[inptrs], 184]\n"
    "fmla v2.4s, v30.4s, v5.4s\n"
    "ldr x25, [%[outptrs], 72]\n"
    "str s17, [x24, x28]\n"
    "fmla v16.4s, v30.4s, v10.4s\n"
    "fmla v15.4s, v30.4s, v8.4s\n"
    "ldr s22, [x19, x27]\n"
    "fmla v18.4s, v30.4s, v6.4s\n"
    "ldr x24, [%[outptrs], 48]\n"
    "fmla v21.4s, v30.4s, v11.4s\n"
    "ldr x19, [%[inptrs], 96]\n"
    "fmla v24.4s, v30.4s, v9.4s\n"
    "fmla v20.4s, v30.4s, v12.4s\n"
    "fmla v14.4s, v31.4s, v4.4s\n"
    "ldr s30, [x22, x27]\n"
    "fmla v2.4s, v31.4s, v7.4s\n"
    "ldr s19, [x21, x27]\n"
    "fmla v3.4s, v31.4s, v5.4s\n"
    "ldr x22, [%[inptrs], 272]\n"
    "fmla v15.4s, v31.4s, v10.4s\n"
    "ldr x21, [%[inptrs], 232]\n"
    "str s14, [x23, x28]\n"
    "fmla v18.4s, v31.4s, v8.4s\n"
    "fmla v24.4s, v31.4s, v11.4s\n"
    "ldr s31, [x20, x27]\n"
    "fmla v3.4s, v26.4s, v7.4s\n"
    "ldr s17, [x22, x27]\n"
    "fmla v0.4s, v23.4s, v4.4s\n"
    "ldr x22, [%[inptrs], 280]\n"
    "fmla v18.4s, v26.4s, v10.4s\n"
    "ldr s14, [x21, x27]\n"
    "fmla v16.4s, v23.4s, v5.4s\n"
    "ldr x23, [%[outptrs], 24]\n"
    "fmla v21.4s, v23.4s, v6.4s\n"
    "ldr s26, [x22, x27]\n"
    "str s0, [x26, x28]\n"
    "fmla v1.4s, v27.4s, v4.4s\n"
    "fmla v15.4s, v27.4s, v5.4s\n"
    "ldr s13, [%[wbptr]]\n"
    "fmla v16.4s, v27.4s, v7.4s\n"
    "ldr x26, [%[outptrs], 104]\n"
    "fmla v21.4s, v27.4s, v8.4s\n"
    "add x27, x27, #4\n"
    "str s1, [x25, x28]\n"
    "fmla v24.4s, v27.4s, v6.4s\n"
    "fmla v20.4s, v27.4s, v9.4s\n"
    "ldr s12, [%[wbptr], #4]\n"
    "fmla v2.4s, v28.4s, v4.4s\n"
    "ldr s29, [x17, x27]\n"
    "fmla v15.4s, v28.4s, v7.4s\n"
    "ldr s27, [x7, x27]\n"
    "fmla v18.4s, v28.4s, v5.4s\n"
    "ldr x25, [%[outptrs], 80]\n"
    "fmla v21.4s, v28.4s, v10.4s\n"
    "ldr x17, [%[inptrs], 8]\n"
    "str s2, [x24, x28]\n"
    "fmla v24.4s, v28.4s, v8.4s\n"
    "fmla v20.4s, v28.4s, v11.4s\n"
    "ldr s9, [%[wbptr], #16]\n"
    "fmla v3.4s, v22.4s, v4.4s\n"
    "ldr s28, [x17, x27]\n"
    "fmla v18.4s, v22.4s, v7.4s\n"
    "ldr s25, [x19, x27]\n"
    "fmla v24.4s, v22.4s, v10.4s\n"
    "ldr x24, [%[outptrs], 56]\n"
    "fmla v16.4s, v30.4s, v4.4s\n"
    "ldr s11, [%[wbptr], #8]\n"
    "str s3, [x23, x28]\n"
    "fmla v21.4s, v30.4s, v5.4s\n"
    "fmla v20.4s, v30.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 56]\n"
    "fmla v15.4s, v19.4s, v4.4s\n"
    "ldr x17, [%[inptrs], 16]\n"
    "str s16, [x26, x28]\n"
    "fmla v24.4s, v19.4s, v5.4s\n"
    "fmla v21.4s, v19.4s, v7.4s\n"
    "ldr s16, [x7, x27]\n"
    "fmla v20.4s, v19.4s, v8.4s\n"
    "ldr s6, [%[wbptr], #28]\n"
    "str s15, [x25, x28]\n"
    "fmla v18.4s, v31.4s, v4.4s\n"
    "fmla v24.4s, v31.4s, v7.4s\n"
    "ldr s15, [x17, x27]\n"
    "fmla v21.4s, v17.4s, v4.4s\n"
    "ldr x25, [%[outptrs], 88]\n"
    "fmla v20.4s, v31.4s, v10.4s\n"
    "ldr s8, [%[wbptr], #20]\n"
    "str s18, [x24, x28]\n"
    "mov v18.16b, v13.16b\n"
    "fmla v24.4s, v14.4s, v4.4s\n"
    "ldr x26, [%[outptrs], 112]\n"
    "mov v22.16b, v13.16b\n"
    "ldr x20, [%[inptrs], 144]\n"
    "str s21, [x26, x28]\n"
    "fmla v20.4s, v17.4s, v5.4s\n"
    "mov v23.16b, v13.16b\n"
    "ldr s10, [%[wbptr], #12]\n"
    "str s24, [x25, x28]\n"
    "mov v19.16b, v13.16b\n"
    "mov v17.16b, v13.16b\n"
    "ldr s21, [x20, x27]\n"
    "fmla v20.4s, v14.4s, v7.4s\n"
    "ldr s5, [%[wbptr], #32]\n"
    "mov v14.16b, v13.16b\n"
    "ldr x26, [%[outptrs], 120]\n"
    "mov v0.16b, v13.16b\n"
    "ldr x19, [%[inptrs], 104]\n"
    "mov v1.16b, v13.16b\n"
    "mov v2.16b, v13.16b\n"
    "fmla v20.4s, v26.4s, v4.4s\n"
    "ldr s7, [%[wbptr], #24]\n"
    "fmla v18.4s, v29.4s, v12.4s\n"
    "ldr s29, [x19, x27]\n"
    "str s20, [x26, x28]\n"
    "ldr s4, [%[wbptr], #36]\n"
    "add x28, x28, #4\n"
    "bne 5b\n"
    "6:\n"
    "mov v3.16b, v13.16b\n"
    "ldr x7, [%[inptrs], 64]\n"
    "fmla v18.4s, v27.4s, v9.4s\n"
    "ldr x17, [%[inptrs], 24]\n"
    "fmla v22.4s, v27.4s, v12.4s\n"
    "ldr s30, [x7, x27]\n"
    "fmla v23.4s, v28.4s, v12.4s\n"
    "ldr x21, [%[inptrs], 192]\n"
    "fmla v19.4s, v25.4s, v12.4s\n"
    "ldr x20, [%[inptrs], 152]\n"
    "fmla v18.4s, v28.4s, v11.4s\n"
    "ldr s24, [x17, x27]\n"
    "fmla v22.4s, v25.4s, v9.4s\n"
    "ldr x19, [%[inptrs], 112]\n"
    "fmla v23.4s, v16.4s, v9.4s\n"
    "ldr x7, [%[inptrs], 72]\n"
    "fmla v17.4s, v16.4s, v12.4s\n"
    "ldr x17, [%[inptrs], 32]\n"
    "fmla v18.4s, v25.4s, v6.4s\n"
    "ldr s31, [x21, x27]\n"
    "fmla v22.4s, v16.4s, v11.4s\n"
    "ldr x22, [%[inptrs], 240]\n"
    "fmla v23.4s, v15.4s, v11.4s\n"
    "ldr x21, [%[inptrs], 200]\n"
    "fmla v14.4s, v15.4s, v12.4s\n"
    "ldr x23, [%[outptrs], 0]\n"
    "fmla v18.4s, v16.4s, v8.4s\n"
    "ldr s25, [x20, x27]\n"
    "fmla v22.4s, v21.4s, v6.4s\n"
    "ldr x20, [%[inptrs], 160]\n"
    "fmla v19.4s, v21.4s, v9.4s\n"
    "ldr x24, [%[outptrs], 32]\n"
    "fmla v0.4s, v21.4s, v12.4s\n"
    "ldr s21, [x19, x27]\n"
    "fmla v18.4s, v15.4s, v10.4s\n"
    "ldr s20, [x7, x27]\n"
    "fmla v22.4s, v29.4s, v8.4s\n"
    "ldr x19, [%[inptrs], 120]\n"
    "fmla v23.4s, v29.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 80]\n"
    "fmla v19.4s, v29.4s, v11.4s\n"
    "ldr x25, [%[outptrs], 64]\n"
    "fmla v18.4s, v29.4s, v5.4s\n"
    "ldr x26, [%[outptrs], 96]\n"
    "fmla v17.4s, v29.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v1.4s, v29.4s, v12.4s\n"
    "ldr s26, [x17, x27]\n"
    "fmla v22.4s, v30.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v18.4s, v30.4s, v7.4s\n"
    "ldr x17, [%[inptrs], 40]\n"
    "fmla v23.4s, v30.4s, v8.4s\n"
    "fmla v17.4s, v30.4s, v11.4s\n"
    "fmla v14.4s, v30.4s, v9.4s\n"
    "fmla v2.4s, v30.4s, v12.4s\n"
    "mov v16.16b, v13.16b\n"
    "fmla v3.4s, v24.4s, v12.4s\n"
    "fmla v19.4s, v31.4s, v6.4s\n"
    "fmla v0.4s, v31.4s, v9.4s\n"
    "mov v15.16b, v13.16b\n"
    "fmla v23.4s, v24.4s, v10.4s\n"
    "fmla v14.4s, v24.4s, v11.4s\n"
    "ldr s27, [x22, x27]\n"
    "fmla v22.4s, v25.4s, v5.4s\n"
    "ldr x22, [%[inptrs], 248]\n"
    "fmla v19.4s, v25.4s, v8.4s\n"
    "fmla v17.4s, v25.4s, v6.4s\n"
    "fmla v0.4s, v25.4s, v11.4s\n"
    "fmla v1.4s, v25.4s, v9.4s\n"
    "fmla v16.4s, v25.4s, v12.4s\n"
    "ldr s30, [x21, x27]\n"
    "fmla v18.4s, v21.4s, v4.4s\n"
    "ldr x21, [%[inptrs], 208]\n"
    "fmla v22.4s, v21.4s, v7.4s\n"
    "fmla v23.4s, v21.4s, v5.4s\n"
    "fmla v19.4s, v21.4s, v10.4s\n"
    "fmla v17.4s, v21.4s, v8.4s\n"
    "fmla v14.4s, v21.4s, v6.4s\n"
    "fmla v1.4s, v21.4s, v11.4s\n"
    "str s18, [x23, x28]\n"
    "mov v18.16b, v13.16b\n"
    "fmla v2.4s, v21.4s, v9.4s\n"
    "ldr x23, [%[outptrs], 8]\n"
    "fmla v15.4s, v21.4s, v12.4s\n"
    "ldr s24, [x20, x27]\n"
    "fmla v23.4s, v20.4s, v7.4s\n"
    "ldr x20, [%[inptrs], 168]\n"
    "fmla v17.4s, v20.4s, v10.4s\n"
    "fmla v14.4s, v20.4s, v8.4s\n"
    "fmla v2.4s, v20.4s, v11.4s\n"
    "fmla v3.4s, v20.4s, v9.4s\n"
    "fmla v18.4s, v20.4s, v12.4s\n"
    "ldr s25, [x19, x27]\n"
    "fmla v0.4s, v27.4s, v6.4s\n"
    "ldr s29, [x7, x27]\n"
    "fmla v14.4s, v26.4s, v10.4s\n"
    "ldr x19, [%[inptrs], 128]\n"
    "fmla v3.4s, v26.4s, v11.4s\n"
    "ldr s27, [x17, x27]\n"
    "fmla v19.4s, v30.4s, v5.4s\n"
    "ldr x7, [%[inptrs], 88]\n"
    "fmla v0.4s, v30.4s, v8.4s\n"
    "fmla v1.4s, v30.4s, v6.4s\n"
    "fmla v16.4s, v30.4s, v9.4s\n"
    "ldr s28, [x22, x27]\n"
    "fmla v22.4s, v24.4s, v4.4s\n"
    "ldr x22, [%[inptrs], 256]\n"
    "fmla v19.4s, v24.4s, v7.4s\n"
    "fmla v17.4s, v24.4s, v5.4s\n"
    "fmla v0.4s, v24.4s, v10.4s\n"
    "fmla v1.4s, v24.4s, v8.4s\n"
    "fmla v2.4s, v24.4s, v6.4s\n"
    "fmla v16.4s, v24.4s, v11.4s\n"
    "str s22, [x24, x28]\n"
    "mov v21.16b, v13.16b\n"
    "fmla v15.4s, v24.4s, v9.4s\n"
    "ldr x24, [%[outptrs], 40]\n"
    "fmla v23.4s, v25.4s, v4.4s\n"
    "fmla v17.4s, v25.4s, v7.4s\n"
    "fmla v21.4s, v24.4s, v12.4s\n"
    "ldr s22, [x21, x27]\n"
    "fmla v14.4s, v25.4s, v5.4s\n"
    "ldr x21, [%[inptrs], 216]\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "fmla v2.4s, v25.4s, v8.4s\n"
    "str s23, [x23, x28]\n"
    "mov v24.16b, v13.16b\n"
    "mov v20.16b, v13.16b\n"
    "ldr x23, [%[outptrs], 16]\n"
    "fmla v3.4s, v25.4s, v6.4s\n"
    "fmla v15.4s, v25.4s, v11.4s\n"
    "fmla v18.4s, v25.4s, v9.4s\n"
    "fmla v24.4s, v25.4s, v12.4s\n"
    "fmla v14.4s, v29.4s, v7.4s\n"
    "ldr s30, [x20, x27]\n"
    "fmla v2.4s, v29.4s, v10.4s\n"
    "ldr x20, [%[inptrs], 176]\n"
    "fmla v3.4s, v29.4s, v8.4s\n"
    "fmla v0.4s, v28.4s, v5.4s\n"
    "fmla v18.4s, v29.4s, v11.4s\n"
    "ldr s31, [x19, x27]\n"
    "fmla v16.4s, v28.4s, v6.4s\n"
    "ldr s26, [x7, x27]\n"
    "fmla v19.4s, v22.4s, v4.4s\n"
    "ldr x19, [%[inptrs], 136]\n"
    "fmla v3.4s, v27.4s, v10.4s\n"
    "ldr s23, [x22, x27]\n"
    "fmla v0.4s, v22.4s, v7.4s\n"
    "ldr x22, [%[inptrs], 264]\n"
    "fmla v1.4s, v22.4s, v5.4s\n"
    "fmla v16.4s, v22.4s, v8.4s\n"
    "str s19, [x25, x28]\n"
    "fmla v15.4s, v22.4s, v6.4s\n"
    "fmla v21.4s, v22.4s, v9.4s\n"
    "ldr s27, [x21, x27]\n"
    "fmla v17.4s, v30.4s, v4.4s\n"
    "ldr s28, [x20, x27]\n"
    "fmla v1.4s, v30.4s, v7.4s\n"
    "ldr x21, [%[inptrs], 224]\n"
    "fmla v2.4s, v30.4s, v5.4s\n"
    "ldr x20, [%[inptrs], 184]\n"
    "fmla v16.4s, v30.4s, v10.4s\n"
    "ldr x25, [%[outptrs], 72]\n"
    "str s17, [x24, x28]\n"
    "fmla v15.4s, v30.4s, v8.4s\n"
    "fmla v18.4s, v30.4s, v6.4s\n"
    "ldr s22, [x19, x27]\n"
    "fmla v21.4s, v30.4s, v11.4s\n"
    "ldr x24, [%[outptrs], 48]\n"
    "fmla v24.4s, v30.4s, v9.4s\n"
    "fmla v20.4s, v30.4s, v12.4s\n"
    "fmla v14.4s, v31.4s, v4.4s\n"
    "ldr s30, [x22, x27]\n"
    "fmla v2.4s, v31.4s, v7.4s\n"
    "ldr s19, [x21, x27]\n"
    "fmla v3.4s, v31.4s, v5.4s\n"
    "ldr x22, [%[inptrs], 272]\n"
    "fmla v15.4s, v31.4s, v10.4s\n"
    "ldr x21, [%[inptrs], 232]\n"
    "str s14, [x23, x28]\n"
    "fmla v18.4s, v31.4s, v8.4s\n"
    "fmla v24.4s, v31.4s, v11.4s\n"
    "ldr s31, [x20, x27]\n"
    "fmla v3.4s, v26.4s, v7.4s\n"
    "ldr s17, [x22, x27]\n"
    "fmla v0.4s, v23.4s, v4.4s\n"
    "ldr x22, [%[inptrs], 280]\n"
    "fmla v18.4s, v26.4s, v10.4s\n"
    "ldr s14, [x21, x27]\n"
    "fmla v16.4s, v23.4s, v5.4s\n"
    "ldr x23, [%[outptrs], 24]\n"
    "fmla v21.4s, v23.4s, v6.4s\n"
    "ldr s26, [x22, x27]\n"
    "str s0, [x26, x28]\n"
    "fmla v1.4s, v27.4s, v4.4s\n"
    "fmla v15.4s, v27.4s, v5.4s\n"
    "ldr x26, [%[outptrs], 104]\n"
    "fmla v16.4s, v27.4s, v7.4s\n"
    "add x27, x27, #4\n"
    "fmla v21.4s, v27.4s, v8.4s\n"
    "fmla v24.4s, v27.4s, v6.4s\n"
    "str s1, [x25, x28]\n"
    "fmla v20.4s, v27.4s, v9.4s\n"
    "fmla v2.4s, v28.4s, v4.4s\n"
    "ldr x25, [%[outptrs], 80]\n"
    "fmla v15.4s, v28.4s, v7.4s\n"
    "fmla v18.4s, v28.4s, v5.4s\n"
    "fmla v21.4s, v28.4s, v10.4s\n"
    "fmla v24.4s, v28.4s, v8.4s\n"
    "fmla v20.4s, v28.4s, v11.4s\n"
    "fmla v3.4s, v22.4s, v4.4s\n"
    "str s2, [x24, x28]\n"
    "fmla v16.4s, v30.4s, v4.4s\n"
    "fmla v18.4s, v22.4s, v7.4s\n"
    "ldr x24, [%[outptrs], 56]\n"
    "fmla v24.4s, v22.4s, v10.4s\n"
    "fmla v21.4s, v30.4s, v5.4s\n"
    "str s3, [x23, x28]\n"
    "fmla v20.4s, v30.4s, v6.4s\n"
    "str s16, [x26, x28]\n"
    "fmla v15.4s, v19.4s, v4.4s\n"
    "fmla v18.4s, v31.4s, v4.4s\n"
    "ldr x26, [%[outptrs], 112]\n"
    "fmla v21.4s, v19.4s, v7.4s\n"
    "fmla v24.4s, v19.4s, v5.4s\n"
    "fmla v20.4s, v19.4s, v8.4s\n"
    "str s15, [x25, x28]\n"
    "str s18, [x24, x28]\n"
    "ldr x25, [%[outptrs], 88]\n"
    "fmla v24.4s, v31.4s, v7.4s\n"
    "fmla v21.4s, v17.4s, v4.4s\n"
    "fmla v20.4s, v31.4s, v10.4s\n"
    "str s21, [x26, x28]\n"
    "fmla v20.4s, v17.4s, v5.4s\n"
    "ldr x26, [%[outptrs], 120]\n"
    "fmla v24.4s, v14.4s, v4.4s\n"
    "fmla v20.4s, v14.4s, v7.4s\n"
    "str s24, [x25, x28]\n"
    "fmla v20.4s, v26.4s, v4.4s\n"
    "str s20, [x26, x28]\n"
    "add x28, x28, #4\n"
    "7:\n"
    : [wbptr] "+r" (weight_bias_ptr)
    : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
  );
}

template <>
template <>
void Conv::execute_tile<ActivationFunction::ReLU>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    "add x9, %[inptr0], %[input_row_stride]\n"
    "add x28, %[input_col_stride1], %[input_col_stride1]\n"
    "add x16, %[outptr0], %[output_row_stride]\n"
    "add x24, x9, %[input_row_stride]\n"
    "add x25, x28, #64\n"
    "add x23, x28, %[input_col_stride1]\n"
    "add x26, x24, %[input_row_stride]\n"
    "add x11, x23, #64\n"
    "add x12, x23, %[input_col_stride1]\n"
    "add x10, x26, %[input_row_stride]\n"
    "add x13, x12, #64\n"
    "add x14, x12, %[input_col_stride1]\n"
    "add x27, x10, %[input_row_stride]\n"
    "add x15, x14, #64\n"
    "add x17, x16, %[output_row_stride]\n"
    "add x7, x17, %[output_row_stride]\n"
    "add x19, %[output_col_stride1], %[output_col_stride1]\n"
    "and x21, %[n_channels], #3\n"
    "add x20, x19, %[output_col_stride1]\n"
    "lsr x22, %[n_channels], #2\n"
    "cbz x22, 4f\n"
    "1:\n"
    "ldr q21, [%[wbptr]]\n"
    "subs x22, x22, #1\n"
    "mov v7.16b, v21.16b\n"
    "ldr q20, [%[wbptr], #16]\n"
    "mov v3.16b, v21.16b\n"
    "ldr q14, [%[wbptr], #32]\n"
    "mov v6.16b, v21.16b\n"
    "ldr q13, [%[wbptr], #48]\n"
    "mov v15.16b, v21.16b\n"
    "ldr q17, [%[wbptr], #64]\n"
    "mov v2.16b, v21.16b\n"
    "ldr q12, [%[wbptr], #80]\n"
    "mov v5.16b, v21.16b\n"
    "ldr q11, [%[wbptr], #96]\n"
    "mov v0.16b, v21.16b\n"
    "ldr q10, [%[wbptr], #112]\n"
    "mov v16.16b, v21.16b\n"
    "ldr q9, [%[wbptr], #128]\n"
    "mov v1.16b, v21.16b\n"
    "ldr q8, [%[wbptr], #144]\n"
    "mov v4.16b, v21.16b\n"
    "ldr q22, [%[inptr0]]\n"
    "fmla v7.4s, v22.4s, v20.4s\n"
    "ldr q19, [x9]\n"
    "fmla v3.4s, v19.4s, v20.4s\n"
    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v6.4s, v23.4s, v20.4s\n"
    "ldr q18, [x24]\n"
    "fmla v7.4s, v19.4s, v17.4s\n"
    "ldr q27, [x9, %[input_col_stride1]]\n"
    "fmla v3.4s, v18.4s, v17.4s\n"
    "ldr q28, [%[inptr0], x28]\n"
    "fmla v15.4s, v18.4s, v20.4s\n"
    "ldr q25, [x26]\n"
    "fmla v7.4s, v23.4s, v14.4s\n"
    "ldr q22, [x24, %[input_col_stride1]]\n"
    "fmla v3.4s, v27.4s, v14.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x9, #64]\n"
    "prfm pldl1keep, [%[inptr0], x8]\n"
    "fmla v7.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "prfm pldl1keep, [x9, x8]\n"
    "prfm pldl1keep, [%[inptr0], x25]\n"
    "prfm pldl1keep, [x26, #64]\n"
    "prfm pldl1keep, [x24, x8]\n"
    "fmla v7.4s, v27.4s, v12.4s\n"
    "beq 3f\n"
    "2:\n"
    "mov v18.16b, v21.16b\n"
    "ldr q23, [x9, x28]\n"
    "mov v19.16b, v21.16b\n"
    "prfm pldl1keep, [x9, x25]\n"
    "fmla v6.4s, v27.4s, v17.4s\n"
    "prfm pldl1keep, [%[inptr0], x11]\n"
    "fmla v2.4s, v27.4s, v20.4s\n"
    "ldr q24, [%[inptr0], x23]\n"
    "fmla v7.4s, v28.4s, v13.4s\n"
    "prfm pldl1keep, [x10, #64]\n"
    "fmla v6.4s, v28.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x8]\n"
    "fmla v5.4s, v28.4s, v20.4s\n"
    "ldr q26, [x10]\n"
    "fmla v3.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x25]\n"
    "fmla v15.4s, v25.4s, v17.4s\n"
    "prfm pldl1keep, [x9, x11]\n"
    "fmla v0.4s, v25.4s, v20.4s\n"
    "ldr q25, [x26, %[input_col_stride1]]\n"
    "fmla v7.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [%[inptr0], x13]\n"
    "fmla v3.4s, v22.4s, v12.4s\n"
    "prfm pldl1keep, [x27, #64]\n"
    "fmla v6.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [x10, x8]\n"
    "fmla v15.4s, v22.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x25]\n"
    "fmla v2.4s, v22.4s, v17.4s\n"
    "prfm pldl1keep, [x24, x11]\n"
    "fmla v16.4s, v22.4s, v20.4s\n"
    "ldr q22, [x24, x28]\n"
    "fmla v7.4s, v23.4s, v11.4s\n"
    "prfm pldl1keep, [x9, x13]\n"
    "fmla v3.4s, v23.4s, v13.4s\n"
    "prfm pldl1keep, [%[inptr0], x15]\n"
    "fmla v6.4s, v23.4s, v12.4s\n"
    "prfm pldl1keep, [x27, x8]\n"
    "fmla v2.4s, v23.4s, v14.4s\n"
    "prfm pldl1keep, [x10, x25]\n"
    "fmla v5.4s, v23.4s, v17.4s\n"
    "prfm pldl1keep, [x26, x11]\n"
    "fmla v1.4s, v23.4s, v20.4s\n"
    "ldr q23, [x9, x23]\n"
    "fmla v6.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [x24, x13]\n"
    "fmla v5.4s, v24.4s, v14.4s\n"
    "prfm pldl1keep, [x9, x15]\n"
    "fmla v4.4s, v24.4s, v20.4s\n"
    "ldr q24, [%[inptr0], x12]\n"
    "fmla v15.4s, v26.4s, v10.4s\n"
    "prfm pldl1keep, [x27, x25]\n"
    "fmla v0.4s, v26.4s, v17.4s\n"
    "ldr q29, [x27]\n"
    "fmla v3.4s, v25.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x11]\n"
    "fmla v15.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x26, x13]\n"
    "fmla v2.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x15]\n"
    "fmla v0.4s, v25.4s, v14.4s\n"
    "prfm pldl1keep, [x27, x11]\n"
    "fmla v16.4s, v25.4s, v17.4s\n"
    "prfm pldl1keep, [x10, x13]\n"
    "fmla v18.4s, v25.4s, v20.4s\n"
    "ldr q26, [x10, %[input_col_stride1]]\n"
    "fmla v7.4s, v22.4s, v8.4s\n"
    "prfm pldl1keep, [x26, x15]\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "prfm pldl1keep, [x27, x13]\n"
    "fmla v6.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x15]\n"
    "fmla v15.4s, v22.4s, v13.4s\n"
    "prfm pldl1keep, [x27, x15]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v5.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v16.4s, v22.4s, v14.4s\n"
    "subs x22, x22, #1\n"
    "fmla v1.4s, v22.4s, v17.4s\n"
    "fmla v19.4s, v22.4s, v20.4s\n"
    "mov v22.16b, v21.16b\n"
    "fmla v6.4s, v23.4s, v11.4s\n"
    "fmla v2.4s, v23.4s, v13.4s\n"
    "fmla v5.4s, v23.4s, v12.4s\n"
    "fmla v1.4s, v23.4s, v14.4s\n"
    "fmla v4.4s, v23.4s, v17.4s\n"
    "fmla v22.4s, v23.4s, v20.4s\n"
    "ldr q27, [x26, x28]\n"
    "fmla v5.4s, v24.4s, v13.4s\n"
    "fmla v0.4s, v29.4s, v10.4s\n"
    "mov v23.16b, v21.16b\n"
    "fmla v4.4s, v24.4s, v14.4s\n"
    "mov v25.16b, v21.16b\n"
    "mov v24.16b, v21.16b\n"
    "fmla v15.4s, v26.4s, v9.4s\n"
    "fmla v0.4s, v26.4s, v12.4s\n"
    "fmla v16.4s, v26.4s, v10.4s\n"
    "fmla v18.4s, v26.4s, v17.4s\n"
    "fmla v3.4s, v27.4s, v8.4s\n"
    "ldr q29, [x24, x23]\n"
    "fmla v15.4s, v27.4s, v11.4s\n"
    "fmla v2.4s, v27.4s, v9.4s\n"
    "fmla v0.4s, v27.4s, v13.4s\n"
    "fmla v16.4s, v27.4s, v12.4s\n"
    "fmla v1.4s, v27.4s, v10.4s\n"
    "fmla v18.4s, v27.4s, v14.4s\n"
    "fmla v19.4s, v27.4s, v17.4s\n"
    "fmla v23.4s, v27.4s, v20.4s\n"
    "fmla v6.4s, v29.4s, v8.4s\n"
    "ldr q28, [x9, x12]\n"
    "fmla v2.4s, v29.4s, v11.4s\n"
    "fmla v5.4s, v29.4s, v9.4s\n"
    "fmla v16.4s, v29.4s, v13.4s\n"
    "fmla v1.4s, v29.4s, v12.4s\n"
    "fmla v4.4s, v29.4s, v10.4s\n"
    "fmla v19.4s, v29.4s, v14.4s\n"
    "fmla v22.4s, v29.4s, v17.4s\n"
    "fmla v25.4s, v29.4s, v20.4s\n"
    "fmla v5.4s, v28.4s, v11.4s\n"
    "ldr q21, [%[inptr0], x14]\n"
    "fmla v1.4s, v28.4s, v13.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v4.4s, v28.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v22.4s, v28.4s, v14.4s\n"
    "ldr q26, [x27, %[input_col_stride1]]\n"
    "fmla v0.4s, v26.4s, v9.4s\n"
    "prfm pldl1keep, [%[inptr0], x8]\n"
    "fmla v4.4s, v21.4s, v13.4s\n"
    "ldr q21, [x10, x28]\n"
    "fmla v18.4s, v26.4s, v10.4s\n"
    "ldr q29, [x26, x23]\n"
    "fmla v15.4s, v21.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x25]\n"
    "fmla v0.4s, v21.4s, v11.4s\n"
    "fmla v16.4s, v21.4s, v9.4s\n"
    "fmla v18.4s, v21.4s, v12.4s\n"
    "fmla v19.4s, v21.4s, v10.4s\n"
    "fmla v23.4s, v21.4s, v17.4s\n"
    "ldr q21, [x24, x12]\n"
    "fmla v2.4s, v29.4s, v8.4s\n"
    "fmla v16.4s, v29.4s, v11.4s\n"
    "fmla v1.4s, v29.4s, v9.4s\n"
    "fmla v18.4s, v29.4s, v13.4s\n"
    "fmla v19.4s, v29.4s, v12.4s\n"
    "fmla v22.4s, v29.4s, v10.4s\n"
    "fmla v23.4s, v29.4s, v14.4s\n"
    "fmla v25.4s, v29.4s, v17.4s\n"
    "fmla v24.4s, v29.4s, v20.4s\n"
    "ldr q28, [x9, x14]\n"
    "fmla v5.4s, v21.4s, v8.4s\n"
    "ldr q27, [x27, x28]\n"
    "fmla v1.4s, v21.4s, v11.4s\n"
    "add x9, x9, #16\n"
    "fmla v4.4s, v21.4s, v9.4s\n"
    "prfm pldl1keep, [x9, #64]\n"
    "fmla v19.4s, v21.4s, v13.4s\n"
    "prfm pldl1keep, [x9, x8]\n"
    "fmla v22.4s, v21.4s, v12.4s\n"
    "fmla v25.4s, v21.4s, v14.4s\n"
    "fmla v4.4s, v28.4s, v11.4s\n"
    "ldr q20, [x10, x23]\n"
    "fmla v0.4s, v27.4s, v8.4s\n"
    "fmla v18.4s, v27.4s, v9.4s\n"
    "fmla v22.4s, v28.4s, v13.4s\n"
    "ldr q26, [x26, x12]\n"
    "fmla v23.4s, v27.4s, v10.4s\n"
    "ldr q21, [x24, x14]\n"
    "fmla v16.4s, v20.4s, v8.4s\n"
    "add x24, x24, #16\n"
    "fmla v18.4s, v20.4s, v11.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v19.4s, v20.4s, v9.4s\n"
    "prfm pldl1keep, [x24, x8]\n"
    "fmla v23.4s, v20.4s, v12.4s\n"
    "fmla v25.4s, v20.4s, v10.4s\n"
    "fmla v24.4s, v20.4s, v17.4s\n"
    "ldr q28, [x27, x23]\n"
    "fmla v1.4s, v26.4s, v8.4s\n"
    "ldr q20, [x10, x12]\n"
    "fmla v19.4s, v26.4s, v11.4s\n"
    "fmla v22.4s, v26.4s, v9.4s\n"
    "fmla v23.4s, v26.4s, v13.4s\n"
    "fmla v25.4s, v26.4s, v12.4s\n"
    "fmla v24.4s, v26.4s, v14.4s\n"
    "ldr q17, [x26, x14]\n"
    "fmla v4.4s, v21.4s, v8.4s\n"
    "ldr q26, [x27, x12]\n"
    "fmla v22.4s, v21.4s, v11.4s\n"
    "add x26, x26, #16\n"
    "fmla v25.4s, v21.4s, v13.4s\n"
    "ldr q27, [x10, x14]\n"
    "fmla v18.4s, v28.4s, v8.4s\n"
    "prfm pldl1keep, [x26, #64]\n"
    "fmla v23.4s, v28.4s, v9.4s\n"
    "add x10, x10, #16\n"
    "fmla v24.4s, v28.4s, v10.4s\n"
    "ldr q28, [x27, x14]\n"
    "fmla v19.4s, v20.4s, v8.4s\n"
    "ldr q21, [%[wbptr]]\n"
    "fmla v23.4s, v20.4s, v11.4s\n"
    "add x27, x27, #16\n"
    "fmla v25.4s, v20.4s, v9.4s\n"
    "fmla v24.4s, v20.4s, v12.4s\n"
    "fmla v22.4s, v17.4s, v8.4s\n"
    "ldr q20, [%[wbptr], #16]\n"
    "fmla v23.4s, v26.4s, v8.4s\n"
    "ldr q14, [%[wbptr], #32]\n"
    "fmla v24.4s, v17.4s, v13.4s\n"
    "movi v29.16b, #0\n"
    "fmla v25.4s, v17.4s, v11.4s\n"
    "ldr q17, [%[wbptr], #64]\n"
    "fmax v7.4s, v7.4s, v29.4s\n"
    "fmax v6.4s, v6.4s, v29.4s\n"
    "fmla v24.4s, v26.4s, v9.4s\n"
    "ldr q13, [%[wbptr], #48]\n"
    "str q7, [%[outptr0]]\n"
    "fmla v25.4s, v27.4s, v8.4s\n"
    "str q6, [%[outptr0], %[output_col_stride1]]\n"
    "fmax v5.4s, v5.4s, v29.4s\n"
    "fmla v24.4s, v27.4s, v11.4s\n"
    "ldr q12, [%[wbptr], #80]\n"
    "str q5, [%[outptr0], x19]\n"
    "fmax v4.4s, v4.4s, v29.4s\n"
    "fmax v3.4s, v3.4s, v29.4s\n"
    "ldr q10, [%[wbptr], #112]\n"
    "str q4, [%[outptr0], x20]\n"
    "fmla v24.4s, v28.4s, v8.4s\n"
    "str q3, [x16]\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "ldr q11, [%[wbptr], #96]\n"
    "str q2, [x16, %[output_col_stride1]]\n"
    "fmax v22.4s, v22.4s, v29.4s\n"
    "str q1, [x16, x19]\n"
    "fmax v15.4s, v15.4s, v29.4s\n"
    "str q22, [x16, x20]\n"
    "fmax v16.4s, v16.4s, v29.4s\n"
    "str q15, [x17]\n"
    "fmax v19.4s, v19.4s, v29.4s\n"
    "str q16, [x17, %[output_col_stride1]]\n"
    "fmax v25.4s, v25.4s, v29.4s\n"
    "str q19, [x17, x19]\n"
    "fmax v0.4s, v0.4s, v29.4s\n"
    "str q25, [x17, x20]\n"
    "fmax v18.4s, v18.4s, v29.4s\n"
    "str q0, [x7]\n"
    "fmax v23.4s, v23.4s, v29.4s\n"
    "str q18, [x7, %[output_col_stride1]]\n"
    "fmax v24.4s, v24.4s, v29.4s\n"
    "str q23, [x7, x19]\n"
    "mov v7.16b, v21.16b\n"
    "str q24, [x7, x20]\n"
    "mov v3.16b, v21.16b\n"
    "mov v6.16b, v21.16b\n"
    "ldr q9, [%[wbptr], #128]\n"
    "mov v15.16b, v21.16b\n"
    "ldr q8, [%[wbptr], #144]\n"
    "mov v2.16b, v21.16b\n"
    "ldr q22, [%[inptr0]]\n"
    "mov v5.16b, v21.16b\n"
    "ldr q19, [x9]\n"
    "mov v0.16b, v21.16b\n"
    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
    "mov v16.16b, v21.16b\n"
    "ldr q18, [x24]\n"
    "mov v1.16b, v21.16b\n"
    "ldr q27, [x9, %[input_col_stride1]]\n"
    "mov v4.16b, v21.16b\n"
    "ldr q28, [%[inptr0], x28]\n"
    "fmla v7.4s, v22.4s, v20.4s\n"
    "ldr q25, [x26]\n"
    "fmla v3.4s, v19.4s, v20.4s\n"
    "ldr q22, [x24, %[input_col_stride1]]\n"
    "fmla v6.4s, v23.4s, v20.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmla v7.4s, v19.4s, v17.4s\n"
    "add x16, x16, #16\n"
    "fmla v3.4s, v18.4s, v17.4s\n"
    "add x17, x17, #16\n"
    "fmla v15.4s, v18.4s, v20.4s\n"
    "add x7, x7, #16\n"
    "fmla v7.4s, v23.4s, v14.4s\n"
    "fmla v3.4s, v27.4s, v14.4s\n"
    "fmla v7.4s, v18.4s, v10.4s\n"
    "fmla v7.4s, v27.4s, v12.4s\n"
    "bne 2b\n"
    "3:\n"
    "mov v18.16b, v21.16b\n"
    "ldr q23, [x9, x28]\n"
    "mov v19.16b, v21.16b\n"
    "prfm pldl1keep, [x9, x25]\n"
    "fmla v6.4s, v27.4s, v17.4s\n"
    "prfm pldl1keep, [%[inptr0], x11]\n"
    "fmla v2.4s, v27.4s, v20.4s\n"
    "ldr q24, [%[inptr0], x23]\n"
    "fmla v7.4s, v28.4s, v13.4s\n"
    "prfm pldl1keep, [x10, #64]\n"
    "fmla v6.4s, v28.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x8]\n"
    "fmla v5.4s, v28.4s, v20.4s\n"
    "ldr q26, [x10]\n"
    "fmla v3.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x25]\n"
    "fmla v15.4s, v25.4s, v17.4s\n"
    "prfm pldl1keep, [x9, x11]\n"
    "fmla v0.4s, v25.4s, v20.4s\n"
    "ldr q25, [x26, %[input_col_stride1]]\n"
    "fmla v7.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [%[inptr0], x13]\n"
    "fmla v3.4s, v22.4s, v12.4s\n"
    "prfm pldl1keep, [x27, #64]\n"
    "fmla v6.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [x10, x8]\n"
    "fmla v15.4s, v22.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x25]\n"
    "fmla v2.4s, v22.4s, v17.4s\n"
    "prfm pldl1keep, [x24, x11]\n"
    "fmla v16.4s, v22.4s, v20.4s\n"
    "ldr q22, [x24, x28]\n"
    "fmla v7.4s, v23.4s, v11.4s\n"
    "prfm pldl1keep, [x9, x13]\n"
    "fmla v3.4s, v23.4s, v13.4s\n"
    "prfm pldl1keep, [%[inptr0], x15]\n"
    "fmla v6.4s, v23.4s, v12.4s\n"
    "prfm pldl1keep, [x27, x8]\n"
    "fmla v2.4s, v23.4s, v14.4s\n"
    "prfm pldl1keep, [x10, x25]\n"
    "fmla v5.4s, v23.4s, v17.4s\n"
    "prfm pldl1keep, [x26, x11]\n"
    "fmla v1.4s, v23.4s, v20.4s\n"
    "ldr q23, [x9, x23]\n"
    "fmla v6.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [x24, x13]\n"
    "fmla v5.4s, v24.4s, v14.4s\n"
    "prfm pldl1keep, [x9, x15]\n"
    "fmla v4.4s, v24.4s, v20.4s\n"
    "ldr q24, [%[inptr0], x12]\n"
    "fmla v15.4s, v26.4s, v10.4s\n"
    "prfm pldl1keep, [x27, x25]\n"
    "fmla v0.4s, v26.4s, v17.4s\n"
    "ldr q29, [x27]\n"
    "fmla v3.4s, v25.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x11]\n"
    "fmla v15.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x26, x13]\n"
    "fmla v2.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x15]\n"
    "fmla v0.4s, v25.4s, v14.4s\n"
    "prfm pldl1keep, [x27, x11]\n"
    "fmla v16.4s, v25.4s, v17.4s\n"
    "prfm pldl1keep, [x10, x13]\n"
    "fmla v18.4s, v25.4s, v20.4s\n"
    "ldr q26, [x10, %[input_col_stride1]]\n"
    "fmla v7.4s, v22.4s, v8.4s\n"
    "prfm pldl1keep, [x26, x15]\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "prfm pldl1keep, [x27, x13]\n"
    "fmla v6.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x15]\n"
    "fmla v15.4s, v22.4s, v13.4s\n"
    "prfm pldl1keep, [x27, x15]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v5.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v16.4s, v22.4s, v14.4s\n"
    "fmla v1.4s, v22.4s, v17.4s\n"
    "fmla v19.4s, v22.4s, v20.4s\n"
    "ldr q27, [x26, x28]\n"
    "fmla v6.4s, v23.4s, v11.4s\n"
    "fmla v2.4s, v23.4s, v13.4s\n"
    "fmla v5.4s, v23.4s, v12.4s\n"
    "fmla v1.4s, v23.4s, v14.4s\n"
    "fmla v4.4s, v23.4s, v17.4s\n"
    "fmla v0.4s, v29.4s, v10.4s\n"
    "mov v22.16b, v21.16b\n"
    "fmla v15.4s, v26.4s, v9.4s\n"
    "fmla v5.4s, v24.4s, v13.4s\n"
    "fmla v16.4s, v26.4s, v10.4s\n"
    "fmla v22.4s, v23.4s, v20.4s\n"
    "ldr q29, [x24, x23]\n"
    "fmla v4.4s, v24.4s, v14.4s\n"
    "ldr q28, [x9, x12]\n"
    "fmla v0.4s, v26.4s, v12.4s\n"
    "fmla v18.4s, v26.4s, v17.4s\n"
    "mov v23.16b, v21.16b\n"
    "fmla v3.4s, v27.4s, v8.4s\n"
    "fmla v15.4s, v27.4s, v11.4s\n"
    "fmla v2.4s, v27.4s, v9.4s\n"
    "fmla v0.4s, v27.4s, v13.4s\n"
    "fmla v16.4s, v27.4s, v12.4s\n"
    "fmla v1.4s, v27.4s, v10.4s\n"
    "fmla v18.4s, v27.4s, v14.4s\n"
    "fmla v19.4s, v27.4s, v17.4s\n"
    "fmla v23.4s, v27.4s, v20.4s\n"
    "mov v25.16b, v21.16b\n"
    "mov v24.16b, v21.16b\n"
    "fmla v6.4s, v29.4s, v8.4s\n"
    "fmla v2.4s, v29.4s, v11.4s\n"
    "fmla v5.4s, v29.4s, v9.4s\n"
    "fmla v16.4s, v29.4s, v13.4s\n"
    "fmla v1.4s, v29.4s, v12.4s\n"
    "fmla v4.4s, v29.4s, v10.4s\n"
    "fmla v19.4s, v29.4s, v14.4s\n"
    "fmla v22.4s, v29.4s, v17.4s\n"
    "fmla v25.4s, v29.4s, v20.4s\n"
    "ldr q21, [%[inptr0], x14]\n"
    "fmla v5.4s, v28.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v1.4s, v28.4s, v13.4s\n"
    "fmla v4.4s, v28.4s, v12.4s\n"
    "fmla v22.4s, v28.4s, v14.4s\n"
    "ldr q26, [x27, %[input_col_stride1]]\n"
    "fmla v0.4s, v26.4s, v9.4s\n"
    "fmla v18.4s, v26.4s, v10.4s\n"
    "fmla v4.4s, v21.4s, v13.4s\n"
    "ldr q21, [x10, x28]\n"
    "fmla v15.4s, v21.4s, v8.4s\n"
    "ldr q29, [x26, x23]\n"
    "fmla v0.4s, v21.4s, v11.4s\n"
    "fmla v16.4s, v21.4s, v9.4s\n"
    "fmla v18.4s, v21.4s, v12.4s\n"
    "fmla v19.4s, v21.4s, v10.4s\n"
    "fmla v23.4s, v21.4s, v17.4s\n"
    "ldr q21, [x24, x12]\n"
    "fmla v2.4s, v29.4s, v8.4s\n"
    "fmla v16.4s, v29.4s, v11.4s\n"
    "fmla v1.4s, v29.4s, v9.4s\n"
    "fmla v18.4s, v29.4s, v13.4s\n"
    "fmla v19.4s, v29.4s, v12.4s\n"
    "fmla v22.4s, v29.4s, v10.4s\n"
    "fmla v23.4s, v29.4s, v14.4s\n"
    "fmla v25.4s, v29.4s, v17.4s\n"
    "fmla v24.4s, v29.4s, v20.4s\n"
    "ldr q28, [x9, x14]\n"
    "fmla v5.4s, v21.4s, v8.4s\n"
    "ldr q27, [x27, x28]\n"
    "fmla v1.4s, v21.4s, v11.4s\n"
    "add x9, x9, #16\n"
    "fmla v4.4s, v21.4s, v9.4s\n"
    "fmla v19.4s, v21.4s, v13.4s\n"
    "fmla v22.4s, v21.4s, v12.4s\n"
    "fmla v25.4s, v21.4s, v14.4s\n"
    "fmla v0.4s, v27.4s, v8.4s\n"
    "ldr q20, [x10, x23]\n"
    "fmla v4.4s, v28.4s, v11.4s\n"
    "fmla v18.4s, v27.4s, v9.4s\n"
    "fmla v22.4s, v28.4s, v13.4s\n"
    "ldr q26, [x26, x12]\n"
    "fmla v23.4s, v27.4s, v10.4s\n"
    "ldr q21, [x24, x14]\n"
    "fmla v16.4s, v20.4s, v8.4s\n"
    "add x24, x24, #16\n"
    "fmla v18.4s, v20.4s, v11.4s\n"
    "fmla v19.4s, v20.4s, v9.4s\n"
    "fmla v23.4s, v20.4s, v12.4s\n"
    "fmla v25.4s, v20.4s, v10.4s\n"
    "fmla v24.4s, v20.4s, v17.4s\n"
    "ldr q28, [x27, x23]\n"
    "fmla v1.4s, v26.4s, v8.4s\n"
    "ldr q20, [x10, x12]\n"
    "fmla v19.4s, v26.4s, v11.4s\n"
    "fmla v22.4s, v26.4s, v9.4s\n"
    "fmla v23.4s, v26.4s, v13.4s\n"
    "fmla v25.4s, v26.4s, v12.4s\n"
    "fmla v24.4s, v26.4s, v14.4s\n"
    "ldr q17, [x26, x14]\n"
    "fmla v4.4s, v21.4s, v8.4s\n"
    "ldr q26, [x27, x12]\n"
    "fmla v22.4s, v21.4s, v11.4s\n"
    "add x26, x26, #16\n"
    "fmla v25.4s, v21.4s, v13.4s\n"
    "ldr q27, [x10, x14]\n"
    "fmla v18.4s, v28.4s, v8.4s\n"
    "add x10, x10, #16\n"
    "fmla v23.4s, v28.4s, v9.4s\n"
    "fmla v24.4s, v28.4s, v10.4s\n"
    "fmla v19.4s, v20.4s, v8.4s\n"
    "ldr q28, [x27, x14]\n"
    "fmla v25.4s, v20.4s, v9.4s\n"
    "add x27, x27, #16\n"
    "fmla v23.4s, v20.4s, v11.4s\n"
    "fmla v24.4s, v20.4s, v12.4s\n"
    "fmla v22.4s, v17.4s, v8.4s\n"
    "movi v29.16b, #0\n"
    "fmla v25.4s, v17.4s, v11.4s\n"
    "fmla v24.4s, v17.4s, v13.4s\n"
    "fmla v23.4s, v26.4s, v8.4s\n"
    "fmax v7.4s, v7.4s, v29.4s\n"
    "fmla v25.4s, v27.4s, v8.4s\n"
    "fmax v6.4s, v6.4s, v29.4s\n"
    "str q7, [%[outptr0]]\n"
    "fmla v24.4s, v26.4s, v9.4s\n"
    "str q6, [%[outptr0], %[output_col_stride1]]\n"
    "fmax v5.4s, v5.4s, v29.4s\n"
    "fmax v4.4s, v4.4s, v29.4s\n"
    "fmax v3.4s, v3.4s, v29.4s\n"
    "str q5, [%[outptr0], x19]\n"
    "fmla v24.4s, v27.4s, v11.4s\n"
    "str q4, [%[outptr0], x20]\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "str q3, [x16]\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "str q2, [x16, %[output_col_stride1]]\n"
    "fmla v24.4s, v28.4s, v8.4s\n"
    "str q1, [x16, x19]\n"
    "fmax v22.4s, v22.4s, v29.4s\n"
    "fmax v15.4s, v15.4s, v29.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "str q22, [x16, x20]\n"
    "fmax v16.4s, v16.4s, v29.4s\n"
    "str q15, [x17]\n"
    "fmax v19.4s, v19.4s, v29.4s\n"
    "str q16, [x17, %[output_col_stride1]]\n"
    "fmax v25.4s, v25.4s, v29.4s\n"
    "str q19, [x17, x19]\n"
    "fmax v0.4s, v0.4s, v29.4s\n"
    "str q25, [x17, x20]\n"
    "fmax v18.4s, v18.4s, v29.4s\n"
    "str q0, [x7]\n"
    "fmax v23.4s, v23.4s, v29.4s\n"
    "str q18, [x7, %[output_col_stride1]]\n"
    "fmax v24.4s, v24.4s, v29.4s\n"
    "str q23, [x7, x19]\n"
    "add x16, x16, #16\n"
    "str q24, [x7, x20]\n"
    "add x17, x17, #16\n"
    "add x7, x7, #16\n"
    "4:\n"
    "cbz x21, 7f\n"
    "ldr s21, [%[wbptr]]\n"
    "mov v7.16b, v21.16b\n"
    "ldr s20, [%[wbptr], #4]\n"
    "mov v3.16b, v21.16b\n"
    "ldr s14, [%[wbptr], #8]\n"
    "mov v6.16b, v21.16b\n"
    "ldr s13, [%[wbptr], #12]\n"
    "mov v15.16b, v21.16b\n"
    "ldr s17, [%[wbptr], #16]\n"
    "mov v2.16b, v21.16b\n"
    "ldr s12, [%[wbptr], #20]\n"
    "mov v5.16b, v21.16b\n"
    "ldr s11, [%[wbptr], #24]\n"
    "mov v0.16b, v21.16b\n"
    "ldr s10, [%[wbptr], #28]\n"
    "mov v16.16b, v21.16b\n"
    "ldr s9, [%[wbptr], #32]\n"
    "mov v1.16b, v21.16b\n"
    "ldr s8, [%[wbptr], #36]\n"
    "mov v4.16b, v21.16b\n"
    "ldr s22, [%[inptr0]]\n"
    "fmla v7.4s, v22.4s, v20.4s\n"
    "ldr s19, [x9]\n"
    "fmla v3.4s, v19.4s, v20.4s\n"
    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v6.4s, v23.4s, v20.4s\n"
    "ldr s18, [x24]\n"
    "fmla v7.4s, v19.4s, v17.4s\n"
    "ldr s27, [x9, %[input_col_stride1]]\n"
    "fmla v3.4s, v18.4s, v17.4s\n"
    "ldr s28, [%[inptr0], x28]\n"
    "fmla v15.4s, v18.4s, v20.4s\n"
    "ldr s25, [x26]\n"
    "fmla v7.4s, v23.4s, v14.4s\n"
    "ldr s22, [x24, %[input_col_stride1]]\n"
    "fmla v3.4s, v27.4s, v14.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x9, #64]\n"
    "subs x21, x21, #1\n"
    "prfm pldl1keep, [%[inptr0], x8]\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v7.4s, v18.4s, v10.4s\n"
    "prfm pldl1keep, [x9, x8]\n"
    "prfm pldl1keep, [%[inptr0], x25]\n"
    "prfm pldl1keep, [x26, #64]\n"
    "prfm pldl1keep, [x24, x8]\n"
    "fmla v7.4s, v27.4s, v12.4s\n"
    "beq 6f\n"
    "5:\n"
    "mov v18.16b, v21.16b\n"
    "ldr s23, [x9, x28]\n"
    "mov v19.16b, v21.16b\n"
    "prfm pldl1keep, [x9, x25]\n"
    "fmla v6.4s, v27.4s, v17.4s\n"
    "prfm pldl1keep, [%[inptr0], x11]\n"
    "fmla v2.4s, v27.4s, v20.4s\n"
    "ldr s24, [%[inptr0], x23]\n"
    "fmla v7.4s, v28.4s, v13.4s\n"
    "prfm pldl1keep, [x10, #64]\n"
    "fmla v6.4s, v28.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x8]\n"
    "fmla v5.4s, v28.4s, v20.4s\n"
    "ldr s26, [x10]\n"
    "fmla v3.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x25]\n"
    "fmla v15.4s, v25.4s, v17.4s\n"
    "prfm pldl1keep, [x9, x11]\n"
    "fmla v0.4s, v25.4s, v20.4s\n"
    "ldr s25, [x26, %[input_col_stride1]]\n"
    "fmla v7.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [%[inptr0], x13]\n"
    "fmla v3.4s, v22.4s, v12.4s\n"
    "prfm pldl1keep, [x27, #64]\n"
    "fmla v6.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [x10, x8]\n"
    "fmla v15.4s, v22.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x25]\n"
    "fmla v2.4s, v22.4s, v17.4s\n"
    "prfm pldl1keep, [x24, x11]\n"
    "fmla v16.4s, v22.4s, v20.4s\n"
    "ldr s22, [x24, x28]\n"
    "fmla v7.4s, v23.4s, v11.4s\n"
    "prfm pldl1keep, [x9, x13]\n"
    "fmla v3.4s, v23.4s, v13.4s\n"
    "prfm pldl1keep, [%[inptr0], x15]\n"
    "fmla v6.4s, v23.4s, v12.4s\n"
    "prfm pldl1keep, [x27, x8]\n"
    "fmla v2.4s, v23.4s, v14.4s\n"
    "prfm pldl1keep, [x10, x25]\n"
    "fmla v5.4s, v23.4s, v17.4s\n"
    "prfm pldl1keep, [x26, x11]\n"
    "fmla v1.4s, v23.4s, v20.4s\n"
    "ldr s23, [x9, x23]\n"
    "fmla v6.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [x24, x13]\n"
    "fmla v5.4s, v24.4s, v14.4s\n"
    "prfm pldl1keep, [x9, x15]\n"
    "fmla v4.4s, v24.4s, v20.4s\n"
    "ldr s24, [%[inptr0], x12]\n"
    "fmla v15.4s, v26.4s, v10.4s\n"
    "prfm pldl1keep, [x27, x25]\n"
    "fmla v0.4s, v26.4s, v17.4s\n"
    "ldr s29, [x27]\n"
    "fmla v3.4s, v25.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x11]\n"
    "fmla v15.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x26, x13]\n"
    "fmla v2.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x15]\n"
    "fmla v0.4s, v25.4s, v14.4s\n"
    "prfm pldl1keep, [x27, x11]\n"
    "fmla v16.4s, v25.4s, v17.4s\n"
    "prfm pldl1keep, [x10, x13]\n"
    "fmla v18.4s, v25.4s, v20.4s\n"
    "ldr s26, [x10, %[input_col_stride1]]\n"
    "fmla v7.4s, v22.4s, v8.4s\n"
    "prfm pldl1keep, [x26, x15]\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "prfm pldl1keep, [x27, x13]\n"
    "fmla v6.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x15]\n"
    "fmla v15.4s, v22.4s, v13.4s\n"
    "prfm pldl1keep, [x27, x15]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v5.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v16.4s, v22.4s, v14.4s\n"
    "subs x21, x21, #1\n"
    "fmla v1.4s, v22.4s, v17.4s\n"
    "fmla v19.4s, v22.4s, v20.4s\n"
    "mov v22.16b, v21.16b\n"
    "fmla v6.4s, v23.4s, v11.4s\n"
    "fmla v2.4s, v23.4s, v13.4s\n"
    "fmla v5.4s, v23.4s, v12.4s\n"
    "fmla v1.4s, v23.4s, v14.4s\n"
    "fmla v4.4s, v23.4s, v17.4s\n"
    "fmla v22.4s, v23.4s, v20.4s\n"
    "ldr s27, [x26, x28]\n"
    "fmla v5.4s, v24.4s, v13.4s\n"
    "fmla v0.4s, v29.4s, v10.4s\n"
    "mov v23.16b, v21.16b\n"
    "fmla v4.4s, v24.4s, v14.4s\n"
    "mov v25.16b, v21.16b\n"
    "mov v24.16b, v21.16b\n"
    "fmla v15.4s, v26.4s, v9.4s\n"
    "fmla v0.4s, v26.4s, v12.4s\n"
    "fmla v16.4s, v26.4s, v10.4s\n"
    "fmla v18.4s, v26.4s, v17.4s\n"
    "fmla v3.4s, v27.4s, v8.4s\n"
    "ldr s29, [x24, x23]\n"
    "fmla v15.4s, v27.4s, v11.4s\n"
    "fmla v2.4s, v27.4s, v9.4s\n"
    "fmla v0.4s, v27.4s, v13.4s\n"
    "fmla v16.4s, v27.4s, v12.4s\n"
    "fmla v1.4s, v27.4s, v10.4s\n"
    "fmla v18.4s, v27.4s, v14.4s\n"
    "fmla v19.4s, v27.4s, v17.4s\n"
    "fmla v23.4s, v27.4s, v20.4s\n"
    "fmla v6.4s, v29.4s, v8.4s\n"
    "ldr s28, [x9, x12]\n"
    "fmla v2.4s, v29.4s, v11.4s\n"
    "fmla v5.4s, v29.4s, v9.4s\n"
    "fmla v16.4s, v29.4s, v13.4s\n"
    "fmla v1.4s, v29.4s, v12.4s\n"
    "fmla v4.4s, v29.4s, v10.4s\n"
    "fmla v19.4s, v29.4s, v14.4s\n"
    "fmla v22.4s, v29.4s, v17.4s\n"
    "fmla v25.4s, v29.4s, v20.4s\n"
    "fmla v5.4s, v28.4s, v11.4s\n"
    "ldr s21, [%[inptr0], x14]\n"
    "fmla v1.4s, v28.4s, v13.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v4.4s, v28.4s, v12.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v22.4s, v28.4s, v14.4s\n"
    "ldr s26, [x27, %[input_col_stride1]]\n"
    "fmla v0.4s, v26.4s, v9.4s\n"
    "prfm pldl1keep, [%[inptr0], x8]\n"
    "fmla v4.4s, v21.4s, v13.4s\n"
    "ldr s21, [x10, x28]\n"
    "fmla v18.4s, v26.4s, v10.4s\n"
    "ldr s29, [x26, x23]\n"
    "fmla v15.4s, v21.4s, v8.4s\n"
    "prfm pldl1keep, [%[inptr0], x25]\n"
    "fmla v0.4s, v21.4s, v11.4s\n"
    "fmla v16.4s, v21.4s, v9.4s\n"
    "fmla v18.4s, v21.4s, v12.4s\n"
    "fmla v19.4s, v21.4s, v10.4s\n"
    "fmla v23.4s, v21.4s, v17.4s\n"
    "ldr s21, [x24, x12]\n"
    "fmla v2.4s, v29.4s, v8.4s\n"
    "fmla v16.4s, v29.4s, v11.4s\n"
    "fmla v1.4s, v29.4s, v9.4s\n"
    "fmla v18.4s, v29.4s, v13.4s\n"
    "fmla v19.4s, v29.4s, v12.4s\n"
    "fmla v22.4s, v29.4s, v10.4s\n"
    "fmla v23.4s, v29.4s, v14.4s\n"
    "fmla v25.4s, v29.4s, v17.4s\n"
    "fmla v24.4s, v29.4s, v20.4s\n"
    "ldr s28, [x9, x14]\n"
    "fmla v5.4s, v21.4s, v8.4s\n"
    "ldr s27, [x27, x28]\n"
    "fmla v1.4s, v21.4s, v11.4s\n"
    "add x9, x9, #4\n"
    "fmla v4.4s, v21.4s, v9.4s\n"
    "prfm pldl1keep, [x9, #64]\n"
    "fmla v19.4s, v21.4s, v13.4s\n"
    "prfm pldl1keep, [x9, x8]\n"
    "fmla v22.4s, v21.4s, v12.4s\n"
    "fmla v25.4s, v21.4s, v14.4s\n"
    "fmla v4.4s, v28.4s, v11.4s\n"
    "ldr s20, [x10, x23]\n"
    "fmla v0.4s, v27.4s, v8.4s\n"
    "fmla v18.4s, v27.4s, v9.4s\n"
    "fmla v22.4s, v28.4s, v13.4s\n"
    "ldr s26, [x26, x12]\n"
    "fmla v23.4s, v27.4s, v10.4s\n"
    "ldr s21, [x24, x14]\n"
    "fmla v16.4s, v20.4s, v8.4s\n"
    "add x24, x24, #4\n"
    "fmla v18.4s, v20.4s, v11.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v19.4s, v20.4s, v9.4s\n"
    "prfm pldl1keep, [x24, x8]\n"
    "fmla v23.4s, v20.4s, v12.4s\n"
    "fmla v25.4s, v20.4s, v10.4s\n"
    "fmla v24.4s, v20.4s, v17.4s\n"
    "ldr s28, [x27, x23]\n"
    "fmla v1.4s, v26.4s, v8.4s\n"
    "ldr s20, [x10, x12]\n"
    "fmla v19.4s, v26.4s, v11.4s\n"
    "fmla v22.4s, v26.4s, v9.4s\n"
    "fmla v23.4s, v26.4s, v13.4s\n"
    "fmla v25.4s, v26.4s, v12.4s\n"
    "fmla v24.4s, v26.4s, v14.4s\n"
    "ldr s17, [x26, x14]\n"
    "fmla v4.4s, v21.4s, v8.4s\n"
    "ldr s26, [x27, x12]\n"
    "fmla v22.4s, v21.4s, v11.4s\n"
    "add x26, x26, #4\n"
    "fmla v25.4s, v21.4s, v13.4s\n"
    "ldr s27, [x10, x14]\n"
    "fmla v18.4s, v28.4s, v8.4s\n"
    "prfm pldl1keep, [x26, #64]\n"
    "fmla v23.4s, v28.4s, v9.4s\n"
    "add x10, x10, #4\n"
    "fmla v24.4s, v28.4s, v10.4s\n"
    "ldr s28, [x27, x14]\n"
    "fmla v19.4s, v20.4s, v8.4s\n"
    "ldr s21, [%[wbptr]]\n"
    "fmla v23.4s, v20.4s, v11.4s\n"
    "add x27, x27, #4\n"
    "fmla v25.4s, v20.4s, v9.4s\n"
    "fmla v24.4s, v20.4s, v12.4s\n"
    "fmla v22.4s, v17.4s, v8.4s\n"
    "ldr s20, [%[wbptr], #4]\n"
    "fmla v23.4s, v26.4s, v8.4s\n"
    "ldr s14, [%[wbptr], #8]\n"
    "fmla v24.4s, v17.4s, v13.4s\n"
    "movi v29.16b, #0\n"
    "fmla v25.4s, v17.4s, v11.4s\n"
    "ldr s17, [%[wbptr], #16]\n"
    "fmax v7.4s, v7.4s, v29.4s\n"
    "fmax v6.4s, v6.4s, v29.4s\n"
    "fmla v24.4s, v26.4s, v9.4s\n"
    "ldr s13, [%[wbptr], #12]\n"
    "str s7, [%[outptr0]]\n"
    "fmla v25.4s, v27.4s, v8.4s\n"
    "str s6, [%[outptr0], %[output_col_stride1]]\n"
    "fmax v5.4s, v5.4s, v29.4s\n"
    "fmla v24.4s, v27.4s, v11.4s\n"
    "ldr s12, [%[wbptr], #20]\n"
    "str s5, [%[outptr0], x19]\n"
    "fmax v4.4s, v4.4s, v29.4s\n"
    "fmax v3.4s, v3.4s, v29.4s\n"
    "ldr s10, [%[wbptr], #28]\n"
    "str s4, [%[outptr0], x20]\n"
    "fmla v24.4s, v28.4s, v8.4s\n"
    "str s3, [x16]\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "ldr s11, [%[wbptr], #24]\n"
    "str s2, [x16, %[output_col_stride1]]\n"
    "fmax v22.4s, v22.4s, v29.4s\n"
    "str s1, [x16, x19]\n"
    "fmax v15.4s, v15.4s, v29.4s\n"
    "str s22, [x16, x20]\n"
    "fmax v16.4s, v16.4s, v29.4s\n"
    "str s15, [x17]\n"
    "fmax v19.4s, v19.4s, v29.4s\n"
    "str s16, [x17, %[output_col_stride1]]\n"
    "fmax v25.4s, v25.4s, v29.4s\n"
    "str s19, [x17, x19]\n"
    "fmax v0.4s, v0.4s, v29.4s\n"
    "str s25, [x17, x20]\n"
    "fmax v18.4s, v18.4s, v29.4s\n"
    "str s0, [x7]\n"
    "fmax v23.4s, v23.4s, v29.4s\n"
    "str s18, [x7, %[output_col_stride1]]\n"
    "fmax v24.4s, v24.4s, v29.4s\n"
    "str s23, [x7, x19]\n"
    "mov v7.16b, v21.16b\n"
    "str s24, [x7, x20]\n"
    "mov v3.16b, v21.16b\n"
    "mov v6.16b, v21.16b\n"
    "ldr s9, [%[wbptr], #32]\n"
    "mov v15.16b, v21.16b\n"
    "ldr s8, [%[wbptr], #36]\n"
    "mov v2.16b, v21.16b\n"
    "ldr s22, [%[inptr0]]\n"
    "mov v5.16b, v21.16b\n"
    "ldr s19, [x9]\n"
    "mov v0.16b, v21.16b\n"
    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
    "mov v16.16b, v21.16b\n"
    "ldr s18, [x24]\n"
    "mov v1.16b, v21.16b\n"
    "ldr s27, [x9, %[input_col_stride1]]\n"
    "mov v4.16b, v21.16b\n"
    "ldr s28, [%[inptr0], x28]\n"
    "fmla v7.4s, v22.4s, v20.4s\n"
    "ldr s25, [x26]\n"
    "fmla v3.4s, v19.4s, v20.4s\n"
    "ldr s22, [x24, %[input_col_stride1]]\n"
    "fmla v6.4s, v23.4s, v20.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmla v7.4s, v19.4s, v17.4s\n"
    "add x16, x16, #4\n"
    "fmla v3.4s, v18.4s, v17.4s\n"
    "add x17, x17, #4\n"
    "fmla v15.4s, v18.4s, v20.4s\n"
    "add x7, x7, #4\n"
    "fmla v7.4s, v23.4s, v14.4s\n"
    "fmla v3.4s, v27.4s, v14.4s\n"
    "fmla v7.4s, v18.4s, v10.4s\n"
    "fmla v7.4s, v27.4s, v12.4s\n"
    "bne 5b\n"
    "6:\n"
    "mov v18.16b, v21.16b\n"
    "ldr s23, [x9, x28]\n"
    "mov v19.16b, v21.16b\n"
    "prfm pldl1keep, [x9, x25]\n"
    "fmla v6.4s, v27.4s, v17.4s\n"
    "prfm pldl1keep, [%[inptr0], x11]\n"
    "fmla v2.4s, v27.4s, v20.4s\n"
    "ldr s24, [%[inptr0], x23]\n"
    "fmla v7.4s, v28.4s, v13.4s\n"
    "prfm pldl1keep, [x10, #64]\n"
    "fmla v6.4s, v28.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x8]\n"
    "fmla v5.4s, v28.4s, v20.4s\n"
    "ldr s26, [x10]\n"
    "fmla v3.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x25]\n"
    "fmla v15.4s, v25.4s, v17.4s\n"
    "prfm pldl1keep, [x9, x11]\n"
    "fmla v0.4s, v25.4s, v20.4s\n"
    "ldr s25, [x26, %[input_col_stride1]]\n"
    "fmla v7.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [%[inptr0], x13]\n"
    "fmla v3.4s, v22.4s, v12.4s\n"
    "prfm pldl1keep, [x27, #64]\n"
    "fmla v6.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [x10, x8]\n"
    "fmla v15.4s, v22.4s, v14.4s\n"
    "prfm pldl1keep, [x26, x25]\n"
    "fmla v2.4s, v22.4s, v17.4s\n"
    "prfm pldl1keep, [x24, x11]\n"
    "fmla v16.4s, v22.4s, v20.4s\n"
    "ldr s22, [x24, x28]\n"
    "fmla v7.4s, v23.4s, v11.4s\n"
    "prfm pldl1keep, [x9, x13]\n"
    "fmla v3.4s, v23.4s, v13.4s\n"
    "prfm pldl1keep, [%[inptr0], x15]\n"
    "fmla v6.4s, v23.4s, v12.4s\n"
    "prfm pldl1keep, [x27, x8]\n"
    "fmla v2.4s, v23.4s, v14.4s\n"
    "prfm pldl1keep, [x10, x25]\n"
    "fmla v5.4s, v23.4s, v17.4s\n"
    "prfm pldl1keep, [x26, x11]\n"
    "fmla v1.4s, v23.4s, v20.4s\n"
    "ldr s23, [x9, x23]\n"
    "fmla v6.4s, v24.4s, v13.4s\n"
    "prfm pldl1keep, [x24, x13]\n"
    "fmla v5.4s, v24.4s, v14.4s\n"
    "prfm pldl1keep, [x9, x15]\n"
    "fmla v4.4s, v24.4s, v20.4s\n"
    "ldr s24, [%[inptr0], x12]\n"
    "fmla v15.4s, v26.4s, v10.4s\n"
    "prfm pldl1keep, [x27, x25]\n"
    "fmla v0.4s, v26.4s, v17.4s\n"
    "ldr s29, [x27]\n"
    "fmla v3.4s, v25.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x11]\n"
    "fmla v15.4s, v25.4s, v12.4s\n"
    "prfm pldl1keep, [x26, x13]\n"
    "fmla v2.4s, v25.4s, v10.4s\n"
    "prfm pldl1keep, [x24, x15]\n"
    "fmla v0.4s, v25.4s, v14.4s\n"
    "prfm pldl1keep, [x27, x11]\n"
    "fmla v16.4s, v25.4s, v17.4s\n"
    "prfm pldl1keep, [x10, x13]\n"
    "fmla v18.4s, v25.4s, v20.4s\n"
    "ldr s26, [x10, %[input_col_stride1]]\n"
    "fmla v7.4s, v22.4s, v8.4s\n"
    "prfm pldl1keep, [x26, x15]\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "prfm pldl1keep, [x27, x13]\n"
    "fmla v6.4s, v22.4s, v9.4s\n"
    "prfm pldl1keep, [x10, x15]\n"
    "fmla v15.4s, v22.4s, v13.4s\n"
    "prfm pldl1keep, [x27, x15]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v5.4s, v22.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v16.4s, v22.4s, v14.4s\n"
    "fmla v1.4s, v22.4s, v17.4s\n"
    "fmla v19.4s, v22.4s, v20.4s\n"
    "ldr s27, [x26, x28]\n"
    "fmla v6.4s, v23.4s, v11.4s\n"
    "fmla v2.4s, v23.4s, v13.4s\n"
    "fmla v5.4s, v23.4s, v12.4s\n"
    "fmla v1.4s, v23.4s, v14.4s\n"
    "fmla v4.4s, v23.4s, v17.4s\n"
    "fmla v0.4s, v29.4s, v10.4s\n"
    "mov v22.16b, v21.16b\n"
    "fmla v15.4s, v26.4s, v9.4s\n"
    "fmla v5.4s, v24.4s, v13.4s\n"
    "fmla v16.4s, v26.4s, v10.4s\n"
    "fmla v22.4s, v23.4s, v20.4s\n"
    "ldr s29, [x24, x23]\n"
    "fmla v4.4s, v24.4s, v14.4s\n"
    "ldr s28, [x9, x12]\n"
    "fmla v0.4s, v26.4s, v12.4s\n"
    "fmla v18.4s, v26.4s, v17.4s\n"
    "mov v23.16b, v21.16b\n"
    "fmla v3.4s, v27.4s, v8.4s\n"
    "fmla v15.4s, v27.4s, v11.4s\n"
    "fmla v2.4s, v27.4s, v9.4s\n"
    "fmla v0.4s, v27.4s, v13.4s\n"
    "fmla v16.4s, v27.4s, v12.4s\n"
    "fmla v1.4s, v27.4s, v10.4s\n"
    "fmla v18.4s, v27.4s, v14.4s\n"
    "fmla v19.4s, v27.4s, v17.4s\n"
    "fmla v23.4s, v27.4s, v20.4s\n"
    "mov v25.16b, v21.16b\n"
    "mov v24.16b, v21.16b\n"
    "fmla v6.4s, v29.4s, v8.4s\n"
    "fmla v2.4s, v29.4s, v11.4s\n"
    "fmla v5.4s, v29.4s, v9.4s\n"
    "fmla v16.4s, v29.4s, v13.4s\n"
    "fmla v1.4s, v29.4s, v12.4s\n"
    "fmla v4.4s, v29.4s, v10.4s\n"
    "fmla v19.4s, v29.4s, v14.4s\n"
    "fmla v22.4s, v29.4s, v17.4s\n"
    "fmla v25.4s, v29.4s, v20.4s\n"
    "ldr s21, [%[inptr0], x14]\n"
    "fmla v5.4s, v28.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v1.4s, v28.4s, v13.4s\n"
    "fmla v4.4s, v28.4s, v12.4s\n"
    "fmla v22.4s, v28.4s, v14.4s\n"
    "ldr s26, [x27, %[input_col_stride1]]\n"
    "fmla v0.4s, v26.4s, v9.4s\n"
    "fmla v18.4s, v26.4s, v10.4s\n"
    "fmla v4.4s, v21.4s, v13.4s\n"
    "ldr s21, [x10, x28]\n"
    "fmla v15.4s, v21.4s, v8.4s\n"
    "ldr s29, [x26, x23]\n"
    "fmla v0.4s, v21.4s, v11.4s\n"
    "fmla v16.4s, v21.4s, v9.4s\n"
    "fmla v18.4s, v21.4s, v12.4s\n"
    "fmla v19.4s, v21.4s, v10.4s\n"
    "fmla v23.4s, v21.4s, v17.4s\n"
    "ldr s21, [x24, x12]\n"
    "fmla v2.4s, v29.4s, v8.4s\n"
    "fmla v16.4s, v29.4s, v11.4s\n"
    "fmla v1.4s, v29.4s, v9.4s\n"
    "fmla v18.4s, v29.4s, v13.4s\n"
    "fmla v19.4s, v29.4s, v12.4s\n"
    "fmla v22.4s, v29.4s, v10.4s\n"
    "fmla v23.4s, v29.4s, v14.4s\n"
    "fmla v25.4s, v29.4s, v17.4s\n"
    "fmla v24.4s, v29.4s, v20.4s\n"
    "ldr s28, [x9, x14]\n"
    "fmla v5.4s, v21.4s, v8.4s\n"
    "ldr s27, [x27, x28]\n"
    "fmla v1.4s, v21.4s, v11.4s\n"
    "add x9, x9, #4\n"
    "fmla v4.4s, v21.4s, v9.4s\n"
    "fmla v19.4s, v21.4s, v13.4s\n"
    "fmla v22.4s, v21.4s, v12.4s\n"
    "fmla v25.4s, v21.4s, v14.4s\n"
    "fmla v0.4s, v27.4s, v8.4s\n"
    "ldr s20, [x10, x23]\n"
    "fmla v4.4s, v28.4s, v11.4s\n"
    "fmla v18.4s, v27.4s, v9.4s\n"
    "fmla v22.4s, v28.4s, v13.4s\n"
    "ldr s26, [x26, x12]\n"
    "fmla v23.4s, v27.4s, v10.4s\n"
    "ldr s21, [x24, x14]\n"
    "fmla v16.4s, v20.4s, v8.4s\n"
    "add x24, x24, #4\n"
    "fmla v18.4s, v20.4s, v11.4s\n"
    "fmla v19.4s, v20.4s, v9.4s\n"
    "fmla v23.4s, v20.4s, v12.4s\n"
    "fmla v25.4s, v20.4s, v10.4s\n"
    "fmla v24.4s, v20.4s, v17.4s\n"
    "ldr s28, [x27, x23]\n"
    "fmla v1.4s, v26.4s, v8.4s\n"
    "ldr s20, [x10, x12]\n"
    "fmla v19.4s, v26.4s, v11.4s\n"
    "fmla v22.4s, v26.4s, v9.4s\n"
    "fmla v23.4s, v26.4s, v13.4s\n"
    "fmla v25.4s, v26.4s, v12.4s\n"
    "fmla v24.4s, v26.4s, v14.4s\n"
    "ldr s17, [x26, x14]\n"
    "fmla v4.4s, v21.4s, v8.4s\n"
    "ldr s26, [x27, x12]\n"
    "fmla v22.4s, v21.4s, v11.4s\n"
    "add x26, x26, #4\n"
    "fmla v25.4s, v21.4s, v13.4s\n"
    "ldr s27, [x10, x14]\n"
    "fmla v18.4s, v28.4s, v8.4s\n"
    "add x10, x10, #4\n"
    "fmla v23.4s, v28.4s, v9.4s\n"
    "fmla v24.4s, v28.4s, v10.4s\n"
    "fmla v19.4s, v20.4s, v8.4s\n"
    "ldr s28, [x27, x14]\n"
    "fmla v25.4s, v20.4s, v9.4s\n"
    "add x27, x27, #4\n"
    "fmla v23.4s, v20.4s, v11.4s\n"
    "fmla v24.4s, v20.4s, v12.4s\n"
    "fmla v22.4s, v17.4s, v8.4s\n"
    "movi v29.16b, #0\n"
    "fmla v25.4s, v17.4s, v11.4s\n"
    "fmla v24.4s, v17.4s, v13.4s\n"
    "fmla v23.4s, v26.4s, v8.4s\n"
    "fmax v7.4s, v7.4s, v29.4s\n"
    "fmla v25.4s, v27.4s, v8.4s\n"
    "fmax v6.4s, v6.4s, v29.4s\n"
    "str s7, [%[outptr0]]\n"
    "fmla v24.4s, v26.4s, v9.4s\n"
    "str s6, [%[outptr0], %[output_col_stride1]]\n"
    "fmax v5.4s, v5.4s, v29.4s\n"
    "fmax v4.4s, v4.4s, v29.4s\n"
    "fmax v3.4s, v3.4s, v29.4s\n"
    "str s5, [%[outptr0], x19]\n"
    "fmla v24.4s, v27.4s, v11.4s\n"
    "str s4, [%[outptr0], x20]\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "str s3, [x16]\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "str s2, [x16, %[output_col_stride1]]\n"
    "fmla v24.4s, v28.4s, v8.4s\n"
    "str s1, [x16, x19]\n"
    "fmax v22.4s, v22.4s, v29.4s\n"
    "fmax v15.4s, v15.4s, v29.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "str s22, [x16, x20]\n"
    "fmax v16.4s, v16.4s, v29.4s\n"
    "str s15, [x17]\n"
    "fmax v19.4s, v19.4s, v29.4s\n"
    "str s16, [x17, %[output_col_stride1]]\n"
    "fmax v25.4s, v25.4s, v29.4s\n"
    "str s19, [x17, x19]\n"
    "fmax v0.4s, v0.4s, v29.4s\n"
    "str s25, [x17, x20]\n"
    "fmax v18.4s, v18.4s, v29.4s\n"
    "str s0, [x7]\n"
    "fmax v23.4s, v23.4s, v29.4s\n"
    "str s18, [x7, %[output_col_stride1]]\n"
    "fmax v24.4s, v24.4s, v29.4s\n"
    "str s23, [x7, x19]\n"
    "add x16, x16, #4\n"
    "str s24, [x7, x20]\n"
    "add x17, x17, #4\n"
    "add x7, x7, #4\n"
    "7:\n"
    : [wbptr] "+r" (weight_bias_ptr), [outptr0] "+r" (output), [inptr0] "+r" (input)
    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float))
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
  );
}

template <>
template <>
void Conv::execute_tile<ActivationFunction::ReLU>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *inptrs[6][6],
  float *outptrs[4][4]
)
{
  __asm __volatile(
    "mov x27, xzr\n"
    "mov x28, xzr\n"
    "and x19, %[n_channels], #3\n"
    "lsr x26, %[n_channels], #2\n"
    "cbz x26, 4f\n"
    "1:\n"
    "ldr q25, [%[wbptr]]\n"
    "ldr x25, [%[inptrs], 0]\n"
    "mov v2.16b, v25.16b\n"
    "ldr q22, [%[wbptr], #16]\n"
    "mov v16.16b, v25.16b\n"
    "ldr q9, [%[wbptr], #32]\n"
    "mov v18.16b, v25.16b\n"
    "ldr q8, [%[wbptr], #48]\n"
    "mov v13.16b, v25.16b\n"
    "ldr q19, [%[wbptr], #64]\n"
    "mov v0.16b, v25.16b\n"
    "ldr q7, [%[wbptr], #80]\n"
    "mov v17.16b, v25.16b\n"
    "ldr q6, [%[wbptr], #96]\n"
    "mov v14.16b, v25.16b\n"
    "ldr q5, [%[wbptr], #112]\n"
    "mov v12.16b, v25.16b\n"
    "ldr q4, [%[wbptr], #128]\n"
    "mov v15.16b, v25.16b\n"
    "ldr q3, [%[wbptr], #144]\n"
    "ldr q27, [x25, x27]\n"
    "ldr x17, [%[inptrs], 48]\n"
    "fmla v2.4s, v27.4s, v22.4s\n"
    "ldr x25, [%[inptrs], 8]\n"
    "ldr q26, [x17, x27]\n"
    "ldr x24, [%[inptrs], 96]\n"
    "fmla v16.4s, v26.4s, v22.4s\n"
    "ldr q31, [x25, x27]\n"
    "ldr q28, [x24, x27]\n"
    "ldr x17, [%[inptrs], 56]\n"
    "fmla v2.4s, v26.4s, v19.4s\n"
    "ldr x25, [%[inptrs], 16]\n"
    "ldr q29, [x17, x27]\n"
    "ldr x7, [%[inptrs], 144]\n"
    "ldr x24, [%[inptrs], 104]\n"
    "subs x26, x26, #1\n"
    "ldr q30, [x25, x27]\n"
    "ldr q27, [x7, x27]\n"
    "ldr q21, [x24, x27]\n"
    "fmla v2.4s, v31.4s, v9.4s\n"
    "beq 3f\n"
    "2:\n"
    "mov v1.16b, v25.16b\n"
    "ldr x17, [%[inptrs], 64]\n"
    "mov v10.16b, v25.16b\n"
    "ldr x25, [%[inptrs], 24]\n"
    "fmla v18.4s, v31.4s, v22.4s\n"
    "ldr q23, [x17, x27]\n"
    "fmla v2.4s, v28.4s, v5.4s\n"
    "ldr x15, [%[inptrs], 192]\n"
    "fmla v16.4s, v28.4s, v19.4s\n"
    "ldr x7, [%[inptrs], 152]\n"
    "fmla v13.4s, v28.4s, v22.4s\n"
    "ldr q26, [x25, x27]\n"
    "fmla v18.4s, v29.4s, v19.4s\n"
    "ldr x24, [%[inptrs], 112]\n"
    "fmla v2.4s, v29.4s, v7.4s\n"
    "ldr x17, [%[inptrs], 72]\n"
    "fmla v16.4s, v29.4s, v9.4s\n"
    "ldr x25, [%[inptrs], 32]\n"
    "fmla v0.4s, v29.4s, v22.4s\n"
    "ldr q28, [x15, x27]\n"
    "fmla v18.4s, v30.4s, v9.4s\n"
    "ldr x16, [%[inptrs], 240]\n"
    "fmla v2.4s, v30.4s, v8.4s\n"
    "ldr x15, [%[inptrs], 200]\n"
    "fmla v17.4s, v30.4s, v22.4s\n"
    "ldr q29, [x7, x27]\n"
    "fmla v16.4s, v27.4s, v5.4s\n"
    "ldr x7, [%[inptrs], 160]\n"
    "fmla v13.4s, v27.4s, v19.4s\n"
    "ldr x20, [%[outptrs], 0]\n"
    "fmla v14.4s, v27.4s, v22.4s\n"
    "ldr q20, [x24, x27]\n"
    "fmla v2.4s, v21.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 120]\n"
    "fmla v16.4s, v21.4s, v7.4s\n"
    "ldr x21, [%[outptrs], 32]\n"
    "fmla v18.4s, v21.4s, v5.4s\n"
    "ldr x22, [%[outptrs], 64]\n"
    "fmla v13.4s, v21.4s, v9.4s\n"
    "ldr x23, [%[outptrs], 96]\n"
    "fmla v0.4s, v21.4s, v19.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v12.4s, v21.4s, v22.4s\n"
    "ldr q24, [x17, x27]\n"
    "fmla v2.4s, v23.4s, v6.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v16.4s, v23.4s, v8.4s\n"
    "ldr x17, [%[inptrs], 80]\n"
    "fmla v18.4s, v23.4s, v7.4s\n"
    "subs x26, x26, #1\n"
    "fmla v0.4s, v23.4s, v9.4s\n"
    "fmla v17.4s, v23.4s, v19.4s\n"
    "fmla v15.4s, v23.4s, v22.4s\n"
    "ldr q23, [x25, x27]\n"
    "fmla v1.4s, v26.4s, v22.4s\n"
    "ldr x25, [%[inptrs], 40]\n"
    "fmla v18.4s, v26.4s, v8.4s\n"
    "fmla v13.4s, v28.4s, v5.4s\n"
    "fmla v17.4s, v26.4s, v9.4s\n"
    "ldr q30, [x16, x27]\n"
    "fmla v14.4s, v28.4s, v19.4s\n"
    "ldr q26, [x15, x27]\n"
    "fmla v16.4s, v29.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 248]\n"
    "fmla v13.4s, v29.4s, v7.4s\n"
    "ldr x15, [%[inptrs], 208]\n"
    "fmla v0.4s, v29.4s, v5.4s\n"
    "fmla v12.4s, v29.4s, v19.4s\n"
    "fmla v14.4s, v29.4s, v9.4s\n"
    "fmla v10.4s, v29.4s, v22.4s\n"
    "mov v11.16b, v25.16b\n"
    "fmla v2.4s, v20.4s, v3.4s\n"
    "fmla v16.4s, v20.4s, v6.4s\n"
    "fmla v18.4s, v20.4s, v4.4s\n"
    "fmla v13.4s, v20.4s, v8.4s\n"
    "fmla v0.4s, v20.4s, v7.4s\n"
    "fmla v17.4s, v20.4s, v5.4s\n"
    "fmla v12.4s, v20.4s, v9.4s\n"
    "fmla v15.4s, v20.4s, v19.4s\n"
    "fmla v11.4s, v20.4s, v22.4s\n"
    "mov v21.16b, v25.16b\n"
    "fmla v18.4s, v24.4s, v6.4s\n"
    "fmla v0.4s, v24.4s, v8.4s\n"
    "fmla v1.4s, v24.4s, v19.4s\n"
    "fmla v17.4s, v24.4s, v7.4s\n"
    "fmla v14.4s, v30.4s, v5.4s\n"
    "mov v20.16b, v25.16b\n"
    "fmla v15.4s, v24.4s, v9.4s\n"
    "fmla v21.4s, v24.4s, v22.4s\n"
    "ldr q27, [x7, x27]\n"
    "fmla v1.4s, v23.4s, v9.4s\n"
    "ldr x7, [%[inptrs], 168]\n"
    "fmla v17.4s, v23.4s, v8.4s\n"
    "ldr q30, [x24, x27]\n"
    "fmla v13.4s, v26.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 128]\n"
    "fmla v14.4s, v26.4s, v7.4s\n"
    "fmla v12.4s, v26.4s, v5.4s\n"
    "fmla v10.4s, v26.4s, v19.4s\n"
    "ldr q31, [x17, x27]\n"
    "fmla v16.4s, v27.4s, v3.4s\n"
    "ldr x17, [%[inptrs], 88]\n"
    "fmla v13.4s, v27.4s, v6.4s\n"
    "fmla v0.4s, v27.4s, v4.4s\n"
    "fmla v14.4s, v27.4s, v8.4s\n"
    "fmla v12.4s, v27.4s, v7.4s\n"
    "fmla v15.4s, v27.4s, v5.4s\n"
    "fmla v10.4s, v27.4s, v9.4s\n"
    "fmla v11.4s, v27.4s, v19.4s\n"
    "fmla v20.4s, v27.4s, v22.4s\n"
    "mov v24.16b, v25.16b\n"
    "mov v23.16b, v25.16b\n"
    "fmla v18.4s, v30.4s, v3.4s\n"
    "fmla v0.4s, v30.4s, v6.4s\n"
    "fmla v17.4s, v30.4s, v4.4s\n"
    "fmla v12.4s, v30.4s, v8.4s\n"
    "fmla v15.4s, v30.4s, v7.4s\n"
    "fmla v1.4s, v30.4s, v5.4s\n"
    "fmla v11.4s, v30.4s, v9.4s\n"
    "fmla v21.4s, v30.4s, v19.4s\n"
    "fmla v24.4s, v30.4s, v22.4s\n"
    "ldr q25, [x25, x27]\n"
    "fmla v17.4s, v31.4s, v6.4s\n"
    "ldr x25, [%[inptrs], 0]\n"
    "fmla v15.4s, v31.4s, v8.4s\n"
    "fmla v1.4s, v31.4s, v7.4s\n"
    "fmla v21.4s, v31.4s, v9.4s\n"
    "ldr q26, [x16, x27]\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 256]\n"
    "fmla v10.4s, v26.4s, v5.4s\n"
    "ldr q31, [x15, x27]\n"
    "fmla v1.4s, v25.4s, v8.4s\n"
    "ldr q29, [x7, x27]\n"
    "fmla v13.4s, v31.4s, v3.4s\n"
    "ldr x15, [%[inptrs], 216]\n"
    "fmla v14.4s, v31.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 176]\n"
    "fmla v12.4s, v31.4s, v4.4s\n"
    "fmla v10.4s, v31.4s, v7.4s\n"
    "fmla v11.4s, v31.4s, v5.4s\n"
    "fmla v20.4s, v31.4s, v19.4s\n"
    "fmla v0.4s, v29.4s, v3.4s\n"
    "ldr q28, [x24, x27]\n"
    "fmla v15.4s, v29.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 136]\n"
    "fmla v12.4s, v29.4s, v6.4s\n"
    "fmla v10.4s, v29.4s, v8.4s\n"
    "fmla v11.4s, v29.4s, v7.4s\n"
    "fmla v21.4s, v29.4s, v5.4s\n"
    "fmla v20.4s, v29.4s, v9.4s\n"
    "fmla v24.4s, v29.4s, v19.4s\n"
    "fmla v23.4s, v29.4s, v22.4s\n"
    "ldr q25, [x17, x27]\n"
    "fmla v17.4s, v28.4s, v3.4s\n"
    "ldr q29, [x16, x27]\n"
    "fmla v15.4s, v28.4s, v6.4s\n"
    "ldr x16, [%[inptrs], 264]\n"
    "fmla v1.4s, v28.4s, v4.4s\n"
    "ldr x17, [%[inptrs], 48]\n"
    "fmla v11.4s, v28.4s, v8.4s\n"
    "fmla v21.4s, v28.4s, v7.4s\n"
    "fmla v24.4s, v28.4s, v9.4s\n"
    "ldr q22, [x15, x27]\n"
    "fmla v14.4s, v29.4s, v3.4s\n"
    "ldr x15, [%[inptrs], 224]\n"
    "fmla v1.4s, v25.4s, v6.4s\n"
    "fmla v10.4s, v29.4s, v4.4s\n"
    "fmla v21.4s, v25.4s, v8.4s\n"
    "ldr q27, [x7, x27]\n"
    "fmla v20.4s, v29.4s, v5.4s\n"
    "ldr q26, [x24, x27]\n"
    "fmla v12.4s, v22.4s, v3.4s\n"
    "ldr x7, [%[inptrs], 184]\n"
    "fmla v10.4s, v22.4s, v6.4s\n"
    "ldr x24, [%[inptrs], 96]\n"
    "fmla v11.4s, v22.4s, v4.4s\n"
    "fmla v24.4s, v22.4s, v5.4s\n"
    "fmla v20.4s, v22.4s, v7.4s\n"
    "fmla v23.4s, v22.4s, v19.4s\n"
    "fmla v15.4s, v27.4s, v3.4s\n"
    "ldr q25, [x16, x27]\n"
    "fmla v21.4s, v27.4s, v4.4s\n"
    "ldr q31, [x15, x27]\n"
    "fmla v11.4s, v27.4s, v6.4s\n"
    "ldr x16, [%[inptrs], 272]\n"
    "fmla v20.4s, v27.4s, v8.4s\n"
    "ldr x15, [%[inptrs], 232]\n"
    "fmla v24.4s, v27.4s, v7.4s\n"
    "fmla v23.4s, v27.4s, v9.4s\n"
    "fmla v1.4s, v26.4s, v3.4s\n"
    "ldr q22, [x7, x27]\n"
    "fmla v21.4s, v26.4s, v6.4s\n"
    "ldr q19, [x16, x27]\n"
    "fmla v10.4s, v25.4s, v3.4s\n"
    "ldr x16, [%[inptrs], 280]\n"
    "fmla v24.4s, v26.4s, v8.4s\n"
    "ldr q28, [x15, x27]\n"
    "fmla v20.4s, v25.4s, v4.4s\n"
    "ldr x7, [%[inptrs], 144]\n"
    "fmla v23.4s, v25.4s, v5.4s\n"
    "ldr q30, [x16, x27]\n"
    "fmla v11.4s, v31.4s, v3.4s\n"
    "add x27, x27, #16\n"
    "fmla v24.4s, v31.4s, v4.4s\n"
    "ldr q27, [x25, x27]\n"
    "fmla v20.4s, v31.4s, v6.4s\n"
    "ldr x25, [%[inptrs], 8]\n"
    "fmla v23.4s, v31.4s, v7.4s\n"
    "movi v29.16b, #0\n"
    "fmla v21.4s, v22.4s, v3.4s\n"
    "ldr q26, [x17, x27]\n"
    "fmla v24.4s, v22.4s, v6.4s\n"
    "ldr x17, [%[inptrs], 56]\n"
    "fmla v20.4s, v19.4s, v3.4s\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "fmla v23.4s, v22.4s, v8.4s\n"
    "ldr q25, [%[wbptr]]\n"
    "fmax v18.4s, v18.4s, v29.4s\n"
    "ldr q22, [%[wbptr], #16]\n"
    "str q2, [x20, x28]\n"
    "fmla v24.4s, v28.4s, v3.4s\n"
    "fmax v17.4s, v17.4s, v29.4s\n"
    "ldr q9, [%[wbptr], #32]\n"
    "fmla v23.4s, v19.4s, v4.4s\n"
    "ldr q8, [%[wbptr], #48]\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "ldr q19, [%[wbptr], #64]\n"
    "fmax v16.4s, v16.4s, v29.4s\n"
    "ldr x20, [%[outptrs], 8]\n"
    "fmax v0.4s, v0.4s, v29.4s\n"
    "fmax v15.4s, v15.4s, v29.4s\n"
    "str q18, [x20, x28]\n"
    "fmla v23.4s, v28.4s, v6.4s\n"
    "str q16, [x21, x28]\n"
    "fmax v21.4s, v21.4s, v29.4s\n"
    "fmax v13.4s, v13.4s, v29.4s\n"
    "ldr q7, [%[wbptr], #80]\n"
    "fmax v12.4s, v12.4s, v29.4s\n"
    "ldr q5, [%[wbptr], #112]\n"
    "fmla v23.4s, v30.4s, v3.4s\n"
    "ldr q6, [%[wbptr], #96]\n"
    "str q13, [x22, x28]\n"
    "fmax v11.4s, v11.4s, v29.4s\n"
    "fmax v24.4s, v24.4s, v29.4s\n"
    "ldr q4, [%[wbptr], #128]\n"
    "fmax v14.4s, v14.4s, v29.4s\n"
    "ldr q31, [x25, x27]\n"
    "fmax v10.4s, v10.4s, v29.4s\n"
    "ldr q3, [%[wbptr], #144]\n"
    "fmax v20.4s, v20.4s, v29.4s\n"
    "ldr q28, [x24, x27]\n"
    "str q14, [x23, x28]\n"
    "fmax v23.4s, v23.4s, v29.4s\n"
    "mov v2.16b, v25.16b\n"
    "ldr q29, [x17, x27]\n"
    "ldr x20, [%[outptrs], 16]\n"
    "ldr x21, [%[outptrs], 40]\n"
    "ldr x22, [%[outptrs], 72]\n"
    "ldr x23, [%[outptrs], 104]\n"
    "ldr x25, [%[inptrs], 16]\n"
    "ldr x24, [%[inptrs], 104]\n"
    "str q17, [x20, x28]\n"
    "mov v16.16b, v25.16b\n"
    "str q0, [x21, x28]\n"
    "mov v18.16b, v25.16b\n"
    "str q12, [x22, x28]\n"
    "mov v13.16b, v25.16b\n"
    "str q10, [x23, x28]\n"
    "mov v0.16b, v25.16b\n"
    "fmla v2.4s, v27.4s, v22.4s\n"
    "ldr q30, [x25, x27]\n"
    "fmla v16.4s, v26.4s, v22.4s\n"
    "ldr x20, [%[outptrs], 24]\n"
    "mov v17.16b, v25.16b\n"
    "ldr x21, [%[outptrs], 48]\n"
    "str q1, [x20, x28]\n"
    "mov v14.16b, v25.16b\n"
    "str q15, [x21, x28]\n"
    "mov v12.16b, v25.16b\n"
    "mov v15.16b, v25.16b\n"
    "ldr x21, [%[outptrs], 56]\n"
    "fmla v2.4s, v26.4s, v19.4s\n"
    "ldr q27, [x7, x27]\n"
    "str q21, [x21, x28]\n"
    "ldr x22, [%[outptrs], 80]\n"
    "ldr q21, [x24, x27]\n"
    "ldr x23, [%[outptrs], 112]\n"
    "str q11, [x22, x28]\n"
    "fmla v2.4s, v31.4s, v9.4s\n"
    "str q20, [x23, x28]\n"
    "ldr x22, [%[outptrs], 88]\n"
    "ldr x23, [%[outptrs], 120]\n"
    "str q24, [x22, x28]\n"
    "str q23, [x23, x28]\n"
    "add x28, x28, #16\n"
    "bne 2b\n"
    "3:\n"
    "mov v1.16b, v25.16b\n"
    "ldr x17, [%[inptrs], 64]\n"
    "mov v10.16b, v25.16b\n"
    "ldr x25, [%[inptrs], 24]\n"
    "mov v11.16b, v25.16b\n"
    "ldr x15, [%[inptrs], 192]\n"
    "fmla v18.4s, v31.4s, v22.4s\n"
    "ldr q23, [x17, x27]\n"
    "fmla v2.4s, v28.4s, v5.4s\n"
    "ldr x7, [%[inptrs], 152]\n"
    "fmla v16.4s, v28.4s, v19.4s\n"
    "ldr x24, [%[inptrs], 112]\n"
    "fmla v13.4s, v28.4s, v22.4s\n"
    "ldr q26, [x25, x27]\n"
    "fmla v18.4s, v29.4s, v19.4s\n"
    "ldr x17, [%[inptrs], 72]\n"
    "fmla v2.4s, v29.4s, v7.4s\n"
    "ldr x25, [%[inptrs], 32]\n"
    "fmla v16.4s, v29.4s, v9.4s\n"
    "ldr x16, [%[inptrs], 240]\n"
    "fmla v0.4s, v29.4s, v22.4s\n"
    "ldr q28, [x15, x27]\n"
    "fmla v18.4s, v30.4s, v9.4s\n"
    "ldr x15, [%[inptrs], 200]\n"
    "fmla v2.4s, v30.4s, v8.4s\n"
    "ldr x20, [%[outptrs], 0]\n"
    "fmla v17.4s, v30.4s, v22.4s\n"
    "ldr q29, [x7, x27]\n"
    "fmla v16.4s, v27.4s, v5.4s\n"
    "ldr x7, [%[inptrs], 160]\n"
    "fmla v13.4s, v27.4s, v19.4s\n"
    "ldr x21, [%[outptrs], 32]\n"
    "fmla v14.4s, v27.4s, v22.4s\n"
    "ldr q20, [x24, x27]\n"
    "fmla v2.4s, v21.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 120]\n"
    "fmla v16.4s, v21.4s, v7.4s\n"
    "ldr x22, [%[outptrs], 64]\n"
    "fmla v18.4s, v21.4s, v5.4s\n"
    "ldr x23, [%[outptrs], 96]\n"
    "fmla v13.4s, v21.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v0.4s, v21.4s, v19.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v12.4s, v21.4s, v22.4s\n"
    "ldr q24, [x17, x27]\n"
    "fmla v2.4s, v23.4s, v6.4s\n"
    "ldr x17, [%[inptrs], 80]\n"
    "fmla v16.4s, v23.4s, v8.4s\n"
    "fmla v18.4s, v23.4s, v7.4s\n"
    "fmla v0.4s, v23.4s, v9.4s\n"
    "fmla v17.4s, v23.4s, v19.4s\n"
    "fmla v15.4s, v23.4s, v22.4s\n"
    "ldr q23, [x25, x27]\n"
    "fmla v1.4s, v26.4s, v22.4s\n"
    "ldr x25, [%[inptrs], 40]\n"
    "fmla v18.4s, v26.4s, v8.4s\n"
    "fmla v13.4s, v28.4s, v5.4s\n"
    "fmla v17.4s, v26.4s, v9.4s\n"
    "ldr q30, [x16, x27]\n"
    "fmla v14.4s, v28.4s, v19.4s\n"
    "ldr q26, [x15, x27]\n"
    "fmla v16.4s, v29.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 248]\n"
    "fmla v13.4s, v29.4s, v7.4s\n"
    "ldr x15, [%[inptrs], 208]\n"
    "fmla v0.4s, v29.4s, v5.4s\n"
    "fmla v12.4s, v29.4s, v19.4s\n"
    "fmla v14.4s, v29.4s, v9.4s\n"
    "fmla v10.4s, v29.4s, v22.4s\n"
    "mov v21.16b, v25.16b\n"
    "fmla v2.4s, v20.4s, v3.4s\n"
    "fmla v16.4s, v20.4s, v6.4s\n"
    "fmla v18.4s, v20.4s, v4.4s\n"
    "fmla v13.4s, v20.4s, v8.4s\n"
    "fmla v0.4s, v20.4s, v7.4s\n"
    "fmla v17.4s, v20.4s, v5.4s\n"
    "fmla v12.4s, v20.4s, v9.4s\n"
    "fmla v15.4s, v20.4s, v19.4s\n"
    "fmla v11.4s, v20.4s, v22.4s\n"
    "mov v20.16b, v25.16b\n"
    "fmla v18.4s, v24.4s, v6.4s\n"
    "fmla v0.4s, v24.4s, v8.4s\n"
    "fmla v1.4s, v24.4s, v19.4s\n"
    "fmla v17.4s, v24.4s, v7.4s\n"
    "fmla v21.4s, v24.4s, v22.4s\n"
    "fmla v15.4s, v24.4s, v9.4s\n"
    "ldr q27, [x7, x27]\n"
    "fmla v14.4s, v30.4s, v5.4s\n"
    "ldr q30, [x24, x27]\n"
    "fmla v1.4s, v23.4s, v9.4s\n"
    "ldr x7, [%[inptrs], 168]\n"
    "fmla v17.4s, v23.4s, v8.4s\n"
    "ldr q31, [x17, x27]\n"
    "fmla v13.4s, v26.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 128]\n"
    "fmla v14.4s, v26.4s, v7.4s\n"
    "ldr x17, [%[inptrs], 88]\n"
    "fmla v12.4s, v26.4s, v5.4s\n"
    "fmla v10.4s, v26.4s, v19.4s\n"
    "mov v24.16b, v25.16b\n"
    "mov v23.16b, v25.16b\n"
    "fmla v16.4s, v27.4s, v3.4s\n"
    "fmla v13.4s, v27.4s, v6.4s\n"
    "fmla v0.4s, v27.4s, v4.4s\n"
    "fmla v14.4s, v27.4s, v8.4s\n"
    "fmla v12.4s, v27.4s, v7.4s\n"
    "fmla v15.4s, v27.4s, v5.4s\n"
    "fmla v10.4s, v27.4s, v9.4s\n"
    "fmla v11.4s, v27.4s, v19.4s\n"
    "fmla v20.4s, v27.4s, v22.4s\n"
    "ldr q25, [x25, x27]\n"
    "fmla v18.4s, v30.4s, v3.4s\n"
    "fmla v0.4s, v30.4s, v6.4s\n"
    "fmla v17.4s, v30.4s, v4.4s\n"
    "fmla v12.4s, v30.4s, v8.4s\n"
    "fmla v15.4s, v30.4s, v7.4s\n"
    "fmla v1.4s, v30.4s, v5.4s\n"
    "fmla v11.4s, v30.4s, v9.4s\n"
    "fmla v21.4s, v30.4s, v19.4s\n"
    "fmla v24.4s, v30.4s, v22.4s\n"
    "ldr q26, [x16, x27]\n"
    "fmla v17.4s, v31.4s, v6.4s\n"
    "ldr x16, [%[inptrs], 256]\n"
    "fmla v15.4s, v31.4s, v8.4s\n"
    "fmla v1.4s, v31.4s, v7.4s\n"
    "fmla v21.4s, v31.4s, v9.4s\n"
    "ldr q31, [x15, x27]\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "ldr x15, [%[inptrs], 216]\n"
    "fmla v10.4s, v26.4s, v5.4s\n"
    "ldr q29, [x7, x27]\n"
    "fmla v1.4s, v25.4s, v8.4s\n"
    "ldr q28, [x24, x27]\n"
    "fmla v13.4s, v31.4s, v3.4s\n"
    "ldr x7, [%[inptrs], 176]\n"
    "fmla v14.4s, v31.4s, v6.4s\n"
    "ldr x24, [%[inptrs], 136]\n"
    "fmla v12.4s, v31.4s, v4.4s\n"
    "fmla v10.4s, v31.4s, v7.4s\n"
    "fmla v11.4s, v31.4s, v5.4s\n"
    "fmla v20.4s, v31.4s, v19.4s\n"
    "fmla v0.4s, v29.4s, v3.4s\n"
    "ldr q25, [x17, x27]\n"
    "fmla v15.4s, v29.4s, v4.4s\n"
    "fmla v21.4s, v29.4s, v5.4s\n"
    "fmla v12.4s, v29.4s, v6.4s\n"
    "fmla v10.4s, v29.4s, v8.4s\n"
    "fmla v11.4s, v29.4s, v7.4s\n"
    "fmla v20.4s, v29.4s, v9.4s\n"
    "fmla v24.4s, v29.4s, v19.4s\n"
    "fmla v23.4s, v29.4s, v22.4s\n"
    "fmla v17.4s, v28.4s, v3.4s\n"
    "ldr q29, [x16, x27]\n"
    "fmla v15.4s, v28.4s, v6.4s\n"
    "ldr q22, [x15, x27]\n"
    "fmla v1.4s, v28.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 264]\n"
    "fmla v11.4s, v28.4s, v8.4s\n"
    "ldr x15, [%[inptrs], 224]\n"
    "fmla v21.4s, v28.4s, v7.4s\n"
    "fmla v24.4s, v28.4s, v9.4s\n"
    "fmla v14.4s, v29.4s, v3.4s\n"
    "ldr q27, [x7, x27]\n"
    "fmla v1.4s, v25.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 184]\n"
    "fmla v10.4s, v29.4s, v4.4s\n"
    "fmla v20.4s, v29.4s, v5.4s\n"
    "fmla v21.4s, v25.4s, v8.4s\n"
    "ldr q26, [x24, x27]\n"
    "fmla v12.4s, v22.4s, v3.4s\n"
    "ldr q25, [x16, x27]\n"
    "fmla v11.4s, v22.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 272]\n"
    "fmla v10.4s, v22.4s, v6.4s\n"
    "fmla v20.4s, v22.4s, v7.4s\n"
    "fmla v24.4s, v22.4s, v5.4s\n"
    "fmla v23.4s, v22.4s, v19.4s\n"
    "fmla v15.4s, v27.4s, v3.4s\n"
    "ldr q31, [x15, x27]\n"
    "fmla v11.4s, v27.4s, v6.4s\n"
    "ldr q22, [x7, x27]\n"
    "fmla v21.4s, v27.4s, v4.4s\n"
    "ldr x15, [%[inptrs], 232]\n"
    "fmla v20.4s, v27.4s, v8.4s\n"
    "fmla v24.4s, v27.4s, v7.4s\n"
    "fmla v23.4s, v27.4s, v9.4s\n"
    "ldr q19, [x16, x27]\n"
    "fmla v1.4s, v26.4s, v3.4s\n"
    "ldr q28, [x15, x27]\n"
    "fmla v21.4s, v26.4s, v6.4s\n"
    "ldr x16, [%[inptrs], 280]\n"
    "fmla v24.4s, v26.4s, v8.4s\n"
    "fmla v10.4s, v25.4s, v3.4s\n"
    "fmla v20.4s, v25.4s, v4.4s\n"
    "ldr q30, [x16, x27]\n"
    "fmla v23.4s, v25.4s, v5.4s\n"
    "add x27, x27, #16\n"
    "fmla v11.4s, v31.4s, v3.4s\n"
    "fmla v21.4s, v22.4s, v3.4s\n"
    "fmla v24.4s, v31.4s, v4.4s\n"
    "movi v29.16b, #0\n"
    "fmla v20.4s, v31.4s, v6.4s\n"
    "fmla v23.4s, v31.4s, v7.4s\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "fmax v18.4s, v18.4s, v29.4s\n"
    "fmla v24.4s, v22.4s, v6.4s\n"
    "fmax v17.4s, v17.4s, v29.4s\n"
    "fmla v20.4s, v19.4s, v3.4s\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "str q2, [x20, x28]\n"
    "fmla v23.4s, v22.4s, v8.4s\n"
    "fmax v16.4s, v16.4s, v29.4s\n"
    "ldr x20, [%[outptrs], 8]\n"
    "fmla v24.4s, v28.4s, v3.4s\n"
    "fmax v0.4s, v0.4s, v29.4s\n"
    "str q18, [x20, x28]\n"
    "fmax v15.4s, v15.4s, v29.4s\n"
    "str q16, [x21, x28]\n"
    "fmla v23.4s, v19.4s, v4.4s\n"
    "fmax v21.4s, v21.4s, v29.4s\n"
    "ldr x20, [%[outptrs], 16]\n"
    "fmax v13.4s, v13.4s, v29.4s\n"
    "ldr x21, [%[outptrs], 40]\n"
    "str q17, [x20, x28]\n"
    "fmax v12.4s, v12.4s, v29.4s\n"
    "str q0, [x21, x28]\n"
    "fmla v23.4s, v28.4s, v6.4s\n"
    "str q13, [x22, x28]\n"
    "fmax v11.4s, v11.4s, v29.4s\n"
    "fmax v24.4s, v24.4s, v29.4s\n"
    "ldr x20, [%[outptrs], 24]\n"
    "fmax v14.4s, v14.4s, v29.4s\n"
    "ldr x21, [%[outptrs], 48]\n"
    "str q1, [x20, x28]\n"
    "fmla v23.4s, v30.4s, v3.4s\n"
    "str q15, [x21, x28]\n"
    "fmax v10.4s, v10.4s, v29.4s\n"
    "str q14, [x23, x28]\n"
    "fmax v20.4s, v20.4s, v29.4s\n"
    "ldr x21, [%[outptrs], 56]\n"
    "ldr x22, [%[outptrs], 72]\n"
    "ldr x23, [%[outptrs], 104]\n"
    "fmax v23.4s, v23.4s, v29.4s\n"
    "str q21, [x21, x28]\n"
    "str q12, [x22, x28]\n"
    "str q10, [x23, x28]\n"
    "ldr x22, [%[outptrs], 80]\n"
    "ldr x23, [%[outptrs], 112]\n"
    "str q11, [x22, x28]\n"
    "str q20, [x23, x28]\n"
    "ldr x22, [%[outptrs], 88]\n"
    "ldr x23, [%[outptrs], 120]\n"
    "str q24, [x22, x28]\n"
    "str q23, [x23, x28]\n"
    "add x28, x28, #16\n"
    "4:\n"
    "cbz x19, 7f\n"
    "ldr s25, [%[wbptr]]\n"
    "mov v2.16b, v25.16b\n"
    "ldr s22, [%[wbptr], #4]\n"
    "mov v16.16b, v25.16b\n"
    "ldr s9, [%[wbptr], #8]\n"
    "mov v18.16b, v25.16b\n"
    "ldr s8, [%[wbptr], #12]\n"
    "mov v13.16b, v25.16b\n"
    "ldr s19, [%[wbptr], #16]\n"
    "mov v0.16b, v25.16b\n"
    "ldr s7, [%[wbptr], #20]\n"
    "mov v17.16b, v25.16b\n"
    "ldr s6, [%[wbptr], #24]\n"
    "mov v14.16b, v25.16b\n"
    "ldr s5, [%[wbptr], #28]\n"
    "mov v12.16b, v25.16b\n"
    "ldr s4, [%[wbptr], #32]\n"
    "mov v15.16b, v25.16b\n"
    "ldr s3, [%[wbptr], #36]\n"
    "ldr x25, [%[inptrs], 0]\n"
    "ldr x17, [%[inptrs], 48]\n"
    "ldr x24, [%[inptrs], 96]\n"
    "ldr x7, [%[inptrs], 144]\n"
    "subs x19, x19, #1\n"
    "ldr s27, [x25, x27]\n"
    "fmla v2.4s, v27.4s, v22.4s\n"
    "ldr s26, [x17, x27]\n"
    "fmla v16.4s, v26.4s, v22.4s\n"
    "ldr s28, [x24, x27]\n"
    "ldr s27, [x7, x27]\n"
    "ldr x25, [%[inptrs], 8]\n"
    "ldr x17, [%[inptrs], 56]\n"
    "ldr x24, [%[inptrs], 104]\n"
    "ldr s31, [x25, x27]\n"
    "fmla v2.4s, v26.4s, v19.4s\n"
    "ldr s29, [x17, x27]\n"
    "ldr s21, [x24, x27]\n"
    "ldr x25, [%[inptrs], 16]\n"
    "ldr s30, [x25, x27]\n"
    "fmla v2.4s, v31.4s, v9.4s\n"
    "beq 6f\n"
    "5:\n"
    "mov v1.16b, v25.16b\n"
    "ldr x17, [%[inptrs], 64]\n"
    "mov v10.16b, v25.16b\n"
    "ldr x25, [%[inptrs], 24]\n"
    "fmla v18.4s, v31.4s, v22.4s\n"
    "ldr s23, [x17, x27]\n"
    "fmla v2.4s, v28.4s, v5.4s\n"
    "ldr x15, [%[inptrs], 192]\n"
    "fmla v16.4s, v28.4s, v19.4s\n"
    "ldr x7, [%[inptrs], 152]\n"
    "fmla v13.4s, v28.4s, v22.4s\n"
    "ldr s26, [x25, x27]\n"
    "fmla v18.4s, v29.4s, v19.4s\n"
    "ldr x24, [%[inptrs], 112]\n"
    "fmla v2.4s, v29.4s, v7.4s\n"
    "ldr x17, [%[inptrs], 72]\n"
    "fmla v16.4s, v29.4s, v9.4s\n"
    "ldr x25, [%[inptrs], 32]\n"
    "fmla v0.4s, v29.4s, v22.4s\n"
    "ldr s28, [x15, x27]\n"
    "fmla v18.4s, v30.4s, v9.4s\n"
    "ldr x16, [%[inptrs], 240]\n"
    "fmla v2.4s, v30.4s, v8.4s\n"
    "ldr x15, [%[inptrs], 200]\n"
    "fmla v17.4s, v30.4s, v22.4s\n"
    "ldr s29, [x7, x27]\n"
    "fmla v16.4s, v27.4s, v5.4s\n"
    "ldr x7, [%[inptrs], 160]\n"
    "fmla v13.4s, v27.4s, v19.4s\n"
    "ldr x20, [%[outptrs], 0]\n"
    "fmla v14.4s, v27.4s, v22.4s\n"
    "ldr s20, [x24, x27]\n"
    "fmla v2.4s, v21.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 120]\n"
    "fmla v16.4s, v21.4s, v7.4s\n"
    "ldr x21, [%[outptrs], 32]\n"
    "fmla v18.4s, v21.4s, v5.4s\n"
    "ldr x22, [%[outptrs], 64]\n"
    "fmla v13.4s, v21.4s, v9.4s\n"
    "ldr x23, [%[outptrs], 96]\n"
    "fmla v0.4s, v21.4s, v19.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v12.4s, v21.4s, v22.4s\n"
    "ldr s24, [x17, x27]\n"
    "fmla v2.4s, v23.4s, v6.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v16.4s, v23.4s, v8.4s\n"
    "ldr x17, [%[inptrs], 80]\n"
    "fmla v18.4s, v23.4s, v7.4s\n"
    "subs x19, x19, #1\n"
    "fmla v0.4s, v23.4s, v9.4s\n"
    "fmla v17.4s, v23.4s, v19.4s\n"
    "fmla v15.4s, v23.4s, v22.4s\n"
    "ldr s23, [x25, x27]\n"
    "fmla v1.4s, v26.4s, v22.4s\n"
    "ldr x25, [%[inptrs], 40]\n"
    "fmla v18.4s, v26.4s, v8.4s\n"
    "fmla v13.4s, v28.4s, v5.4s\n"
    "fmla v17.4s, v26.4s, v9.4s\n"
    "ldr s30, [x16, x27]\n"
    "fmla v14.4s, v28.4s, v19.4s\n"
    "ldr s26, [x15, x27]\n"
    "fmla v16.4s, v29.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 248]\n"
    "fmla v13.4s, v29.4s, v7.4s\n"
    "ldr x15, [%[inptrs], 208]\n"
    "fmla v0.4s, v29.4s, v5.4s\n"
    "fmla v12.4s, v29.4s, v19.4s\n"
    "fmla v14.4s, v29.4s, v9.4s\n"
    "fmla v10.4s, v29.4s, v22.4s\n"
    "mov v11.16b, v25.16b\n"
    "fmla v2.4s, v20.4s, v3.4s\n"
    "fmla v16.4s, v20.4s, v6.4s\n"
    "fmla v18.4s, v20.4s, v4.4s\n"
    "fmla v13.4s, v20.4s, v8.4s\n"
    "fmla v0.4s, v20.4s, v7.4s\n"
    "fmla v17.4s, v20.4s, v5.4s\n"
    "fmla v12.4s, v20.4s, v9.4s\n"
    "fmla v15.4s, v20.4s, v19.4s\n"
    "fmla v11.4s, v20.4s, v22.4s\n"
    "mov v21.16b, v25.16b\n"
    "fmla v18.4s, v24.4s, v6.4s\n"
    "fmla v0.4s, v24.4s, v8.4s\n"
    "fmla v1.4s, v24.4s, v19.4s\n"
    "fmla v17.4s, v24.4s, v7.4s\n"
    "fmla v14.4s, v30.4s, v5.4s\n"
    "mov v20.16b, v25.16b\n"
    "fmla v15.4s, v24.4s, v9.4s\n"
    "fmla v21.4s, v24.4s, v22.4s\n"
    "ldr s27, [x7, x27]\n"
    "fmla v1.4s, v23.4s, v9.4s\n"
    "ldr x7, [%[inptrs], 168]\n"
    "fmla v17.4s, v23.4s, v8.4s\n"
    "ldr s30, [x24, x27]\n"
    "fmla v13.4s, v26.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 128]\n"
    "fmla v14.4s, v26.4s, v7.4s\n"
    "fmla v12.4s, v26.4s, v5.4s\n"
    "fmla v10.4s, v26.4s, v19.4s\n"
    "ldr s31, [x17, x27]\n"
    "fmla v16.4s, v27.4s, v3.4s\n"
    "ldr x17, [%[inptrs], 88]\n"
    "fmla v13.4s, v27.4s, v6.4s\n"
    "fmla v0.4s, v27.4s, v4.4s\n"
    "fmla v14.4s, v27.4s, v8.4s\n"
    "fmla v12.4s, v27.4s, v7.4s\n"
    "fmla v15.4s, v27.4s, v5.4s\n"
    "fmla v10.4s, v27.4s, v9.4s\n"
    "fmla v11.4s, v27.4s, v19.4s\n"
    "fmla v20.4s, v27.4s, v22.4s\n"
    "mov v24.16b, v25.16b\n"
    "mov v23.16b, v25.16b\n"
    "fmla v18.4s, v30.4s, v3.4s\n"
    "fmla v0.4s, v30.4s, v6.4s\n"
    "fmla v17.4s, v30.4s, v4.4s\n"
    "fmla v12.4s, v30.4s, v8.4s\n"
    "fmla v15.4s, v30.4s, v7.4s\n"
    "fmla v1.4s, v30.4s, v5.4s\n"
    "fmla v11.4s, v30.4s, v9.4s\n"
    "fmla v21.4s, v30.4s, v19.4s\n"
    "fmla v24.4s, v30.4s, v22.4s\n"
    "ldr s25, [x25, x27]\n"
    "fmla v17.4s, v31.4s, v6.4s\n"
    "ldr x25, [%[inptrs], 0]\n"
    "fmla v15.4s, v31.4s, v8.4s\n"
    "fmla v1.4s, v31.4s, v7.4s\n"
    "fmla v21.4s, v31.4s, v9.4s\n"
    "ldr s26, [x16, x27]\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 256]\n"
    "fmla v10.4s, v26.4s, v5.4s\n"
    "ldr s31, [x15, x27]\n"
    "fmla v1.4s, v25.4s, v8.4s\n"
    "ldr s29, [x7, x27]\n"
    "fmla v13.4s, v31.4s, v3.4s\n"
    "ldr x15, [%[inptrs], 216]\n"
    "fmla v14.4s, v31.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 176]\n"
    "fmla v12.4s, v31.4s, v4.4s\n"
    "fmla v10.4s, v31.4s, v7.4s\n"
    "fmla v11.4s, v31.4s, v5.4s\n"
    "fmla v20.4s, v31.4s, v19.4s\n"
    "fmla v0.4s, v29.4s, v3.4s\n"
    "ldr s28, [x24, x27]\n"
    "fmla v15.4s, v29.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 136]\n"
    "fmla v12.4s, v29.4s, v6.4s\n"
    "fmla v10.4s, v29.4s, v8.4s\n"
    "fmla v11.4s, v29.4s, v7.4s\n"
    "fmla v21.4s, v29.4s, v5.4s\n"
    "fmla v20.4s, v29.4s, v9.4s\n"
    "fmla v24.4s, v29.4s, v19.4s\n"
    "fmla v23.4s, v29.4s, v22.4s\n"
    "ldr s25, [x17, x27]\n"
    "fmla v17.4s, v28.4s, v3.4s\n"
    "ldr s29, [x16, x27]\n"
    "fmla v15.4s, v28.4s, v6.4s\n"
    "ldr x16, [%[inptrs], 264]\n"
    "fmla v1.4s, v28.4s, v4.4s\n"
    "ldr x17, [%[inptrs], 48]\n"
    "fmla v11.4s, v28.4s, v8.4s\n"
    "fmla v21.4s, v28.4s, v7.4s\n"
    "fmla v24.4s, v28.4s, v9.4s\n"
    "ldr s22, [x15, x27]\n"
    "fmla v14.4s, v29.4s, v3.4s\n"
    "ldr x15, [%[inptrs], 224]\n"
    "fmla v1.4s, v25.4s, v6.4s\n"
    "fmla v10.4s, v29.4s, v4.4s\n"
    "fmla v21.4s, v25.4s, v8.4s\n"
    "ldr s27, [x7, x27]\n"
    "fmla v20.4s, v29.4s, v5.4s\n"
    "ldr s26, [x24, x27]\n"
    "fmla v12.4s, v22.4s, v3.4s\n"
    "ldr x7, [%[inptrs], 184]\n"
    "fmla v10.4s, v22.4s, v6.4s\n"
    "ldr x24, [%[inptrs], 96]\n"
    "fmla v11.4s, v22.4s, v4.4s\n"
    "fmla v24.4s, v22.4s, v5.4s\n"
    "fmla v20.4s, v22.4s, v7.4s\n"
    "fmla v23.4s, v22.4s, v19.4s\n"
    "fmla v15.4s, v27.4s, v3.4s\n"
    "ldr s25, [x16, x27]\n"
    "fmla v21.4s, v27.4s, v4.4s\n"
    "ldr s31, [x15, x27]\n"
    "fmla v11.4s, v27.4s, v6.4s\n"
    "ldr x16, [%[inptrs], 272]\n"
    "fmla v20.4s, v27.4s, v8.4s\n"
    "ldr x15, [%[inptrs], 232]\n"
    "fmla v24.4s, v27.4s, v7.4s\n"
    "fmla v23.4s, v27.4s, v9.4s\n"
    "fmla v1.4s, v26.4s, v3.4s\n"
    "ldr s22, [x7, x27]\n"
    "fmla v21.4s, v26.4s, v6.4s\n"
    "ldr s19, [x16, x27]\n"
    "fmla v10.4s, v25.4s, v3.4s\n"
    "ldr x16, [%[inptrs], 280]\n"
    "fmla v24.4s, v26.4s, v8.4s\n"
    "ldr s28, [x15, x27]\n"
    "fmla v20.4s, v25.4s, v4.4s\n"
    "ldr x7, [%[inptrs], 144]\n"
    "fmla v23.4s, v25.4s, v5.4s\n"
    "ldr s30, [x16, x27]\n"
    "fmla v11.4s, v31.4s, v3.4s\n"
    "add x27, x27, #4\n"
    "fmla v24.4s, v31.4s, v4.4s\n"
    "ldr s27, [x25, x27]\n"
    "fmla v20.4s, v31.4s, v6.4s\n"
    "ldr x25, [%[inptrs], 8]\n"
    "fmla v23.4s, v31.4s, v7.4s\n"
    "movi v29.16b, #0\n"
    "fmla v21.4s, v22.4s, v3.4s\n"
    "ldr s26, [x17, x27]\n"
    "fmla v24.4s, v22.4s, v6.4s\n"
    "ldr x17, [%[inptrs], 56]\n"
    "fmla v20.4s, v19.4s, v3.4s\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "fmla v23.4s, v22.4s, v8.4s\n"
    "ldr s25, [%[wbptr]]\n"
    "fmax v18.4s, v18.4s, v29.4s\n"
    "ldr s22, [%[wbptr], #4]\n"
    "str s2, [x20, x28]\n"
    "fmla v24.4s, v28.4s, v3.4s\n"
    "fmax v17.4s, v17.4s, v29.4s\n"
    "ldr s9, [%[wbptr], #8]\n"
    "fmla v23.4s, v19.4s, v4.4s\n"
    "ldr s8, [%[wbptr], #12]\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "ldr s19, [%[wbptr], #16]\n"
    "fmax v16.4s, v16.4s, v29.4s\n"
    "ldr x20, [%[outptrs], 8]\n"
    "fmax v0.4s, v0.4s, v29.4s\n"
    "fmax v15.4s, v15.4s, v29.4s\n"
    "str s18, [x20, x28]\n"
    "fmla v23.4s, v28.4s, v6.4s\n"
    "str s16, [x21, x28]\n"
    "fmax v21.4s, v21.4s, v29.4s\n"
    "fmax v13.4s, v13.4s, v29.4s\n"
    "ldr s7, [%[wbptr], #20]\n"
    "fmax v12.4s, v12.4s, v29.4s\n"
    "ldr s5, [%[wbptr], #28]\n"
    "fmla v23.4s, v30.4s, v3.4s\n"
    "ldr s6, [%[wbptr], #24]\n"
    "str s13, [x22, x28]\n"
    "fmax v11.4s, v11.4s, v29.4s\n"
    "fmax v24.4s, v24.4s, v29.4s\n"
    "ldr s4, [%[wbptr], #32]\n"
    "fmax v14.4s, v14.4s, v29.4s\n"
    "ldr s31, [x25, x27]\n"
    "fmax v10.4s, v10.4s, v29.4s\n"
    "ldr s3, [%[wbptr], #36]\n"
    "fmax v20.4s, v20.4s, v29.4s\n"
    "ldr s28, [x24, x27]\n"
    "str s14, [x23, x28]\n"
    "fmax v23.4s, v23.4s, v29.4s\n"
    "mov v2.16b, v25.16b\n"
    "ldr s29, [x17, x27]\n"
    "ldr x20, [%[outptrs], 16]\n"
    "ldr x21, [%[outptrs], 40]\n"
    "ldr x22, [%[outptrs], 72]\n"
    "ldr x23, [%[outptrs], 104]\n"
    "ldr x25, [%[inptrs], 16]\n"
    "ldr x24, [%[inptrs], 104]\n"
    "str s17, [x20, x28]\n"
    "mov v16.16b, v25.16b\n"
    "str s0, [x21, x28]\n"
    "mov v18.16b, v25.16b\n"
    "str s12, [x22, x28]\n"
    "mov v13.16b, v25.16b\n"
    "str s10, [x23, x28]\n"
    "mov v0.16b, v25.16b\n"
    "fmla v2.4s, v27.4s, v22.4s\n"
    "ldr s30, [x25, x27]\n"
    "fmla v16.4s, v26.4s, v22.4s\n"
    "ldr x20, [%[outptrs], 24]\n"
    "mov v17.16b, v25.16b\n"
    "ldr x21, [%[outptrs], 48]\n"
    "str s1, [x20, x28]\n"
    "mov v14.16b, v25.16b\n"
    "str s15, [x21, x28]\n"
    "mov v12.16b, v25.16b\n"
    "mov v15.16b, v25.16b\n"
    "ldr x21, [%[outptrs], 56]\n"
    "fmla v2.4s, v26.4s, v19.4s\n"
    "ldr s27, [x7, x27]\n"
    "str s21, [x21, x28]\n"
    "ldr x22, [%[outptrs], 80]\n"
    "ldr s21, [x24, x27]\n"
    "ldr x23, [%[outptrs], 112]\n"
    "str s11, [x22, x28]\n"
    "fmla v2.4s, v31.4s, v9.4s\n"
    "str s20, [x23, x28]\n"
    "ldr x22, [%[outptrs], 88]\n"
    "ldr x23, [%[outptrs], 120]\n"
    "str s24, [x22, x28]\n"
    "str s23, [x23, x28]\n"
    "add x28, x28, #4\n"
    "bne 5b\n"
    "6:\n"
    "mov v1.16b, v25.16b\n"
    "ldr x17, [%[inptrs], 64]\n"
    "mov v10.16b, v25.16b\n"
    "ldr x25, [%[inptrs], 24]\n"
    "mov v11.16b, v25.16b\n"
    "ldr x15, [%[inptrs], 192]\n"
    "fmla v18.4s, v31.4s, v22.4s\n"
    "ldr s23, [x17, x27]\n"
    "fmla v2.4s, v28.4s, v5.4s\n"
    "ldr x7, [%[inptrs], 152]\n"
    "fmla v16.4s, v28.4s, v19.4s\n"
    "ldr x24, [%[inptrs], 112]\n"
    "fmla v13.4s, v28.4s, v22.4s\n"
    "ldr s26, [x25, x27]\n"
    "fmla v18.4s, v29.4s, v19.4s\n"
    "ldr x17, [%[inptrs], 72]\n"
    "fmla v2.4s, v29.4s, v7.4s\n"
    "ldr x25, [%[inptrs], 32]\n"
    "fmla v16.4s, v29.4s, v9.4s\n"
    "ldr x16, [%[inptrs], 240]\n"
    "fmla v0.4s, v29.4s, v22.4s\n"
    "ldr s28, [x15, x27]\n"
    "fmla v18.4s, v30.4s, v9.4s\n"
    "ldr x15, [%[inptrs], 200]\n"
    "fmla v2.4s, v30.4s, v8.4s\n"
    "ldr x20, [%[outptrs], 0]\n"
    "fmla v17.4s, v30.4s, v22.4s\n"
    "ldr s29, [x7, x27]\n"
    "fmla v16.4s, v27.4s, v5.4s\n"
    "ldr x7, [%[inptrs], 160]\n"
    "fmla v13.4s, v27.4s, v19.4s\n"
    "ldr x21, [%[outptrs], 32]\n"
    "fmla v14.4s, v27.4s, v22.4s\n"
    "ldr s20, [x24, x27]\n"
    "fmla v2.4s, v21.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 120]\n"
    "fmla v16.4s, v21.4s, v7.4s\n"
    "ldr x22, [%[outptrs], 64]\n"
    "fmla v18.4s, v21.4s, v5.4s\n"
    "ldr x23, [%[outptrs], 96]\n"
    "fmla v13.4s, v21.4s, v9.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v0.4s, v21.4s, v19.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v12.4s, v21.4s, v22.4s\n"
    "ldr s24, [x17, x27]\n"
    "fmla v2.4s, v23.4s, v6.4s\n"
    "ldr x17, [%[inptrs], 80]\n"
    "fmla v16.4s, v23.4s, v8.4s\n"
    "fmla v18.4s, v23.4s, v7.4s\n"
    "fmla v0.4s, v23.4s, v9.4s\n"
    "fmla v17.4s, v23.4s, v19.4s\n"
    "fmla v15.4s, v23.4s, v22.4s\n"
    "ldr s23, [x25, x27]\n"
    "fmla v1.4s, v26.4s, v22.4s\n"
    "ldr x25, [%[inptrs], 40]\n"
    "fmla v18.4s, v26.4s, v8.4s\n"
    "fmla v13.4s, v28.4s, v5.4s\n"
    "fmla v17.4s, v26.4s, v9.4s\n"
    "ldr s30, [x16, x27]\n"
    "fmla v14.4s, v28.4s, v19.4s\n"
    "ldr s26, [x15, x27]\n"
    "fmla v16.4s, v29.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 248]\n"
    "fmla v13.4s, v29.4s, v7.4s\n"
    "ldr x15, [%[inptrs], 208]\n"
    "fmla v0.4s, v29.4s, v5.4s\n"
    "fmla v12.4s, v29.4s, v19.4s\n"
    "fmla v14.4s, v29.4s, v9.4s\n"
    "fmla v10.4s, v29.4s, v22.4s\n"
    "mov v21.16b, v25.16b\n"
    "fmla v2.4s, v20.4s, v3.4s\n"
    "fmla v16.4s, v20.4s, v6.4s\n"
    "fmla v18.4s, v20.4s, v4.4s\n"
    "fmla v13.4s, v20.4s, v8.4s\n"
    "fmla v0.4s, v20.4s, v7.4s\n"
    "fmla v17.4s, v20.4s, v5.4s\n"
    "fmla v12.4s, v20.4s, v9.4s\n"
    "fmla v15.4s, v20.4s, v19.4s\n"
    "fmla v11.4s, v20.4s, v22.4s\n"
    "mov v20.16b, v25.16b\n"
    "fmla v18.4s, v24.4s, v6.4s\n"
    "fmla v0.4s, v24.4s, v8.4s\n"
    "fmla v1.4s, v24.4s, v19.4s\n"
    "fmla v17.4s, v24.4s, v7.4s\n"
    "fmla v21.4s, v24.4s, v22.4s\n"
    "fmla v15.4s, v24.4s, v9.4s\n"
    "ldr s27, [x7, x27]\n"
    "fmla v14.4s, v30.4s, v5.4s\n"
    "ldr s30, [x24, x27]\n"
    "fmla v1.4s, v23.4s, v9.4s\n"
    "ldr x7, [%[inptrs], 168]\n"
    "fmla v17.4s, v23.4s, v8.4s\n"
    "ldr s31, [x17, x27]\n"
    "fmla v13.4s, v26.4s, v4.4s\n"
    "ldr x24, [%[inptrs], 128]\n"
    "fmla v14.4s, v26.4s, v7.4s\n"
    "ldr x17, [%[inptrs], 88]\n"
    "fmla v12.4s, v26.4s, v5.4s\n"
    "fmla v10.4s, v26.4s, v19.4s\n"
    "mov v24.16b, v25.16b\n"
    "mov v23.16b, v25.16b\n"
    "fmla v16.4s, v27.4s, v3.4s\n"
    "fmla v13.4s, v27.4s, v6.4s\n"
    "fmla v0.4s, v27.4s, v4.4s\n"
    "fmla v14.4s, v27.4s, v8.4s\n"
    "fmla v12.4s, v27.4s, v7.4s\n"
    "fmla v15.4s, v27.4s, v5.4s\n"
    "fmla v10.4s, v27.4s, v9.4s\n"
    "fmla v11.4s, v27.4s, v19.4s\n"
    "fmla v20.4s, v27.4s, v22.4s\n"
    "ldr s25, [x25, x27]\n"
    "fmla v18.4s, v30.4s, v3.4s\n"
    "fmla v0.4s, v30.4s, v6.4s\n"
    "fmla v17.4s, v30.4s, v4.4s\n"
    "fmla v12.4s, v30.4s, v8.4s\n"
    "fmla v15.4s, v30.4s, v7.4s\n"
    "fmla v1.4s, v30.4s, v5.4s\n"
    "fmla v11.4s, v30.4s, v9.4s\n"
    "fmla v21.4s, v30.4s, v19.4s\n"
    "fmla v24.4s, v30.4s, v22.4s\n"
    "ldr s26, [x16, x27]\n"
    "fmla v17.4s, v31.4s, v6.4s\n"
    "ldr x16, [%[inptrs], 256]\n"
    "fmla v15.4s, v31.4s, v8.4s\n"
    "fmla v1.4s, v31.4s, v7.4s\n"
    "fmla v21.4s, v31.4s, v9.4s\n"
    "ldr s31, [x15, x27]\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "ldr x15, [%[inptrs], 216]\n"
    "fmla v10.4s, v26.4s, v5.4s\n"
    "ldr s29, [x7, x27]\n"
    "fmla v1.4s, v25.4s, v8.4s\n"
    "ldr s28, [x24, x27]\n"
    "fmla v13.4s, v31.4s, v3.4s\n"
    "ldr x7, [%[inptrs], 176]\n"
    "fmla v14.4s, v31.4s, v6.4s\n"
    "ldr x24, [%[inptrs], 136]\n"
    "fmla v12.4s, v31.4s, v4.4s\n"
    "fmla v10.4s, v31.4s, v7.4s\n"
    "fmla v11.4s, v31.4s, v5.4s\n"
    "fmla v20.4s, v31.4s, v19.4s\n"
    "fmla v0.4s, v29.4s, v3.4s\n"
    "ldr s25, [x17, x27]\n"
    "fmla v15.4s, v29.4s, v4.4s\n"
    "fmla v21.4s, v29.4s, v5.4s\n"
    "fmla v12.4s, v29.4s, v6.4s\n"
    "fmla v10.4s, v29.4s, v8.4s\n"
    "fmla v11.4s, v29.4s, v7.4s\n"
    "fmla v20.4s, v29.4s, v9.4s\n"
    "fmla v24.4s, v29.4s, v19.4s\n"
    "fmla v23.4s, v29.4s, v22.4s\n"
    "fmla v17.4s, v28.4s, v3.4s\n"
    "ldr s29, [x16, x27]\n"
    "fmla v15.4s, v28.4s, v6.4s\n"
    "ldr s22, [x15, x27]\n"
    "fmla v1.4s, v28.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 264]\n"
    "fmla v11.4s, v28.4s, v8.4s\n"
    "ldr x15, [%[inptrs], 224]\n"
    "fmla v21.4s, v28.4s, v7.4s\n"
    "fmla v24.4s, v28.4s, v9.4s\n"
    "fmla v14.4s, v29.4s, v3.4s\n"
    "ldr s27, [x7, x27]\n"
    "fmla v1.4s, v25.4s, v6.4s\n"
    "ldr x7, [%[inptrs], 184]\n"
    "fmla v10.4s, v29.4s, v4.4s\n"
    "fmla v20.4s, v29.4s, v5.4s\n"
    "fmla v21.4s, v25.4s, v8.4s\n"
    "ldr s26, [x24, x27]\n"
    "fmla v12.4s, v22.4s, v3.4s\n"
    "ldr s25, [x16, x27]\n"
    "fmla v11.4s, v22.4s, v4.4s\n"
    "ldr x16, [%[inptrs], 272]\n"
    "fmla v10.4s, v22.4s, v6.4s\n"
    "fmla v20.4s, v22.4s, v7.4s\n"
    "fmla v24.4s, v22.4s, v5.4s\n"
    "fmla v23.4s, v22.4s, v19.4s\n"
    "fmla v15.4s, v27.4s, v3.4s\n"
    "ldr s31, [x15, x27]\n"
    "fmla v11.4s, v27.4s, v6.4s\n"
    "ldr s22, [x7, x27]\n"
    "fmla v21.4s, v27.4s, v4.4s\n"
    "ldr x15, [%[inptrs], 232]\n"
    "fmla v20.4s, v27.4s, v8.4s\n"
    "fmla v24.4s, v27.4s, v7.4s\n"
    "fmla v23.4s, v27.4s, v9.4s\n"
    "ldr s19, [x16, x27]\n"
    "fmla v1.4s, v26.4s, v3.4s\n"
    "ldr s28, [x15, x27]\n"
    "fmla v21.4s, v26.4s, v6.4s\n"
    "ldr x16, [%[inptrs], 280]\n"
    "fmla v24.4s, v26.4s, v8.4s\n"
    "fmla v10.4s, v25.4s, v3.4s\n"
    "fmla v20.4s, v25.4s, v4.4s\n"
    "ldr s30, [x16, x27]\n"
    "fmla v23.4s, v25.4s, v5.4s\n"
    "add x27, x27, #4\n"
    "fmla v11.4s, v31.4s, v3.4s\n"
    "fmla v21.4s, v22.4s, v3.4s\n"
    "fmla v24.4s, v31.4s, v4.4s\n"
    "movi v29.16b, #0\n"
    "fmla v20.4s, v31.4s, v6.4s\n"
    "fmla v23.4s, v31.4s, v7.4s\n"
    "fmax v2.4s, v2.4s, v29.4s\n"
    "fmax v18.4s, v18.4s, v29.4s\n"
    "fmla v24.4s, v22.4s, v6.4s\n"
    "fmax v17.4s, v17.4s, v29.4s\n"
    "fmla v20.4s, v19.4s, v3.4s\n"
    "fmax v1.4s, v1.4s, v29.4s\n"
    "str s2, [x20, x28]\n"
    "fmla v23.4s, v22.4s, v8.4s\n"
    "fmax v16.4s, v16.4s, v29.4s\n"
    "ldr x20, [%[outptrs], 8]\n"
    "fmla v24.4s, v28.4s, v3.4s\n"
    "fmax v0.4s, v0.4s, v29.4s\n"
    "str s18, [x20, x28]\n"
    "fmax v15.4s, v15.4s, v29.4s\n"
    "str s16, [x21, x28]\n"
    "fmla v23.4s, v19.4s, v4.4s\n"
    "fmax v21.4s, v21.4s, v29.4s\n"
    "ldr x20, [%[outptrs], 16]\n"
    "fmax v13.4s, v13.4s, v29.4s\n"
    "ldr x21, [%[outptrs], 40]\n"
    "str s17, [x20, x28]\n"
    "fmax v12.4s, v12.4s, v29.4s\n"
    "str s0, [x21, x28]\n"
    "fmla v23.4s, v28.4s, v6.4s\n"
    "str s13, [x22, x28]\n"
    "fmax v11.4s, v11.4s, v29.4s\n"
    "fmax v24.4s, v24.4s, v29.4s\n"
    "ldr x20, [%[outptrs], 24]\n"
    "fmax v14.4s, v14.4s, v29.4s\n"
    "ldr x21, [%[outptrs], 48]\n"
    "str s1, [x20, x28]\n"
    "fmla v23.4s, v30.4s, v3.4s\n"
    "str s15, [x21, x28]\n"
    "fmax v10.4s, v10.4s, v29.4s\n"
    "str s14, [x23, x28]\n"
    "fmax v20.4s, v20.4s, v29.4s\n"
    "ldr x21, [%[outptrs], 56]\n"
    "ldr x22, [%[outptrs], 72]\n"
    "ldr x23, [%[outptrs], 104]\n"
    "fmax v23.4s, v23.4s, v29.4s\n"
    "str s21, [x21, x28]\n"
    "str s12, [x22, x28]\n"
    "str s10, [x23, x28]\n"
    "ldr x22, [%[outptrs], 80]\n"
    "ldr x23, [%[outptrs], 112]\n"
    "str s11, [x22, x28]\n"
    "str s20, [x23, x28]\n"
    "ldr x22, [%[outptrs], 88]\n"
    "ldr x23, [%[outptrs], 120]\n"
    "str s24, [x22, x28]\n"
    "str s23, [x23, x28]\n"
    "add x28, x28, #4\n"
    "7:\n"
    : [wbptr] "+r" (weight_bias_ptr)
    : [n_channels] "r" ((long) n_channels), [outptrs] "r" (outptrs), [inptrs] "r" (inptrs)
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v31", "v4", "v5", "v6", "v7", "v8", "v9", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
  );
}

template <>
template <>
void Conv::execute_tile<ActivationFunction::ReLU6>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    "add x24, %[inptr0], %[input_row_stride]\n"
    "add x13, %[input_col_stride1], %[input_col_stride1]\n"
    "add x8, %[outptr0], %[output_row_stride]\n"
    "add x9, x24, %[input_row_stride]\n"
    "add x10, x13, #64\n"
    "add x19, x13, %[input_col_stride1]\n"
    "add x20, x9, %[input_row_stride]\n"
    "add x21, x19, #64\n"
    "add x17, x19, %[input_col_stride1]\n"
    "add x22, x20, %[input_row_stride]\n"
    "add x7, x17, #64\n"
    "add x11, x17, %[input_col_stride1]\n"
    "add x23, x22, %[input_row_stride]\n"
    "add x12, x11, #64\n"
    "add x25, x8, %[output_row_stride]\n"
    "add x26, x25, %[output_row_stride]\n"
    "add x27, %[output_col_stride1], %[output_col_stride1]\n"
    "and x14, %[n_channels], #3\n"
    "add x28, x27, %[output_col_stride1]\n"
    "lsr x15, %[n_channels], #2\n"
    "cbz x15, 4f\n"
    "1:\n"
    "ldr q23, [%[wbptr]]\n"
    "subs x15, x15, #1\n"
    "mov v12.16b, v23.16b\n"
    "ldr q20, [%[wbptr], #16]\n"
    "mov v8.16b, v23.16b\n"
    "ldr q6, [%[wbptr], #32]\n"
    "mov v11.16b, v23.16b\n"
    "ldr q5, [%[wbptr], #48]\n"
    "mov v16.16b, v23.16b\n"
    "ldr q19, [%[wbptr], #64]\n"
    "mov v7.16b, v23.16b\n"
    "ldr q4, [%[wbptr], #80]\n"
    "mov v10.16b, v23.16b\n"
    "ldr q3, [%[wbptr], #96]\n"
    "mov v14.16b, v23.16b\n"
    "ldr q2, [%[wbptr], #112]\n"
    "mov v15.16b, v23.16b\n"
    "ldr q1, [%[wbptr], #128]\n"
    "mov v17.16b, v23.16b\n"
    "ldr q0, [%[wbptr], #144]\n"
    "mov v9.16b, v23.16b\n"
    "ldr q28, [%[inptr0]]\n"
    "fmla v12.4s, v28.4s, v20.4s\n"
    "ldr q25, [x24]\n"
    "fmla v8.4s, v25.4s, v20.4s\n"
    "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v11.4s, v18.4s, v20.4s\n"
    "ldr q30, [x9]\n"
    "fmla v12.4s, v25.4s, v19.4s\n"
    "ldr q29, [x24, %[input_col_stride1]]\n"
    "fmla v8.4s, v30.4s, v19.4s\n"
    "ldr q24, [%[inptr0], x13]\n"
    "fmla v16.4s, v30.4s, v20.4s\n"
    "ldr q27, [x20]\n"
    "fmla v12.4s, v18.4s, v6.4s\n"
    "ldr q22, [x9, %[input_col_stride1]]\n"
    "fmla v8.4s, v29.4s, v6.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x24, #64]\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v12.4s, v30.4s, v2.4s\n"
    "prfm pldl1keep, [x9, #64]\n"
    "prfm pldl1keep, [x24, x16]\n"
    "prfm pldl1keep, [%[inptr0], x10]\n"
    "prfm pldl1keep, [x20, #64]\n"
    "prfm pldl1keep, [x9, x16]\n"
    "fmla v12.4s, v29.4s, v4.4s\n"
    "beq 3f\n"
    "2:\n"
    "mov v13.16b, v23.16b\n"
    "ldr q21, [x24, x13]\n"
    "mov v18.16b, v23.16b\n"
    "prfm pldl1keep, [x24, x10]\n"
    "fmla v11.4s, v29.4s, v19.4s\n"
    "prfm pldl1keep, [%[inptr0], x21]\n"
    "fmla v7.4s, v29.4s, v20.4s\n"
    "ldr q25, [%[inptr0], x19]\n"
    "fmla v12.4s, v24.4s, v5.4s\n"
    "prfm pldl1keep, [x22, #64]\n"
    "fmla v11.4s, v24.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x16]\n"
    "fmla v10.4s, v24.4s, v20.4s\n"
    "ldr q24, [x22]\n"
    "fmla v8.4s, v27.4s, v2.4s\n"
    "prfm pldl1keep, [x9, x10]\n"
    "fmla v16.4s, v27.4s, v19.4s\n"
    "prfm pldl1keep, [x24, x21]\n"
    "fmla v14.4s, v27.4s, v20.4s\n"
    "ldr q26, [x20, %[input_col_stride1]]\n"
    "fmla v12.4s, v22.4s, v1.4s\n"
    "prfm pldl1keep, [%[inptr0], x7]\n"
    "fmla v8.4s, v22.4s, v4.4s\n"
    "prfm pldl1keep, [x23, #64]\n"
    "fmla v11.4s, v22.4s, v2.4s\n"
    "prfm pldl1keep, [x22, x16]\n"
    "fmla v16.4s, v22.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x10]\n"
    "fmla v7.4s, v22.4s, v19.4s\n"
    "prfm pldl1keep, [x9, x21]\n"
    "fmla v15.4s, v22.4s, v20.4s\n"
    "ldr q30, [x9, x13]\n"
    "fmla v12.4s, v21.4s, v3.4s\n"
    "prfm pldl1keep, [x24, x7]\n"
    "fmla v8.4s, v21.4s, v5.4s\n"
    "prfm pldl1keep, [%[inptr0], x12]\n"
    "fmla v11.4s, v21.4s, v4.4s\n"
    "prfm pldl1keep, [x23, x16]\n"
    "fmla v7.4s, v21.4s, v6.4s\n"
    "prfm pldl1keep, [x22, x10]\n"
    "fmla v10.4s, v21.4s, v19.4s\n"
    "prfm pldl1keep, [x20, x21]\n"
    "fmla v17.4s, v21.4s, v20.4s\n"
    "ldr q22, [x24, x19]\n"
    "fmla v11.4s, v25.4s, v5.4s\n"
    "prfm pldl1keep, [x9, x7]\n"
    "fmla v10.4s, v25.4s, v6.4s\n"
    "prfm pldl1keep, [x24, x12]\n"
    "fmla v9.4s, v25.4s, v20.4s\n"
    "ldr q21, [%[inptr0], x17]\n"
    "fmla v16.4s, v24.4s, v2.4s\n"
    "prfm pldl1keep, [x23, x10]\n"
    "fmla v14.4s, v24.4s, v19.4s\n"
    "ldr q24, [x23]\n"
    "fmla v8.4s, v26.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x21]\n"
    "fmla v16.4s, v26.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x7]\n"
    "fmla v7.4s, v26.4s, v2.4s\n"
    "prfm pldl1keep, [x9, x12]\n"
    "fmla v14.4s, v26.4s, v6.4s\n"
    "prfm pldl1keep, [x23, x21]\n"
    "fmla v15.4s, v26.4s, v19.4s\n"
    "prfm pldl1keep, [x22, x7]\n"
    "fmla v13.4s, v26.4s, v20.4s\n"
    "ldr q26, [x22, %[input_col_stride1]]\n"
    "fmla v12.4s, v30.4s, v0.4s\n"
    "prfm pldl1keep, [x20, x12]\n"
    "fmla v8.4s, v30.4s, v3.4s\n"
    "prfm pldl1keep, [x23, x7]\n"
    "fmla v11.4s, v30.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x12]\n"
    "fmla v16.4s, v30.4s, v5.4s\n"
    "prfm pldl1keep, [x23, x12]\n"
    "fmla v7.4s, v30.4s, v4.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v10.4s, v30.4s, v2.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v15.4s, v30.4s, v6.4s\n"
    "subs x15, x15, #1\n"
    "fmla v17.4s, v30.4s, v19.4s\n"
    "fmla v18.4s, v30.4s, v20.4s\n"
    "mov v25.16b, v23.16b\n"
    "fmla v11.4s, v22.4s, v3.4s\n"
    "fmla v7.4s, v22.4s, v5.4s\n"
    "fmla v10.4s, v22.4s, v4.4s\n"
    "fmla v17.4s, v22.4s, v6.4s\n"
    "fmla v9.4s, v22.4s, v19.4s\n"
    "fmla v25.4s, v22.4s, v20.4s\n"
    "ldr q27, [x20, x13]\n"
    "fmla v10.4s, v21.4s, v5.4s\n"
    "fmla v14.4s, v24.4s, v2.4s\n"
    "mov v22.16b, v23.16b\n"
    "fmla v9.4s, v21.4s, v6.4s\n"
    "mov v24.16b, v23.16b\n"
    "mov v21.16b, v23.16b\n"
    "fmla v16.4s, v26.4s, v1.4s\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "fmla v15.4s, v26.4s, v2.4s\n"
    "fmla v13.4s, v26.4s, v19.4s\n"
    "fmla v8.4s, v27.4s, v0.4s\n"
    "ldr q28, [x9, x19]\n"
    "fmla v16.4s, v27.4s, v3.4s\n"
    "fmla v7.4s, v27.4s, v1.4s\n"
    "fmla v14.4s, v27.4s, v5.4s\n"
    "fmla v15.4s, v27.4s, v4.4s\n"
    "fmla v17.4s, v27.4s, v2.4s\n"
    "fmla v13.4s, v27.4s, v6.4s\n"
    "fmla v18.4s, v27.4s, v19.4s\n"
    "fmla v22.4s, v27.4s, v20.4s\n"
    "fmla v11.4s, v28.4s, v0.4s\n"
    "ldr q29, [x24, x17]\n"
    "fmla v7.4s, v28.4s, v3.4s\n"
    "fmla v10.4s, v28.4s, v1.4s\n"
    "fmla v15.4s, v28.4s, v5.4s\n"
    "fmla v17.4s, v28.4s, v4.4s\n"
    "fmla v9.4s, v28.4s, v2.4s\n"
    "fmla v18.4s, v28.4s, v6.4s\n"
    "fmla v25.4s, v28.4s, v19.4s\n"
    "fmla v24.4s, v28.4s, v20.4s\n"
    "fmla v10.4s, v29.4s, v3.4s\n"
    "ldr q23, [%[inptr0], x11]\n"
    "fmla v17.4s, v29.4s, v5.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v9.4s, v29.4s, v4.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v25.4s, v29.4s, v6.4s\n"
    "ldr q30, [x23, %[input_col_stride1]]\n"
    "fmla v14.4s, v30.4s, v1.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v9.4s, v23.4s, v5.4s\n"
    "ldr q23, [x22, x13]\n"
    "fmla v13.4s, v30.4s, v2.4s\n"
    "ldr q29, [x20, x19]\n"
    "fmla v16.4s, v23.4s, v0.4s\n"
    "prfm pldl1keep, [%[inptr0], x10]\n"
    "fmla v14.4s, v23.4s, v3.4s\n"
    "fmla v15.4s, v23.4s, v1.4s\n"
    "fmla v13.4s, v23.4s, v4.4s\n"
    "fmla v18.4s, v23.4s, v2.4s\n"
    "fmla v22.4s, v23.4s, v19.4s\n"
    "ldr q23, [x9, x17]\n"
    "fmla v7.4s, v29.4s, v0.4s\n"
    "fmla v15.4s, v29.4s, v3.4s\n"
    "fmla v17.4s, v29.4s, v1.4s\n"
    "fmla v13.4s, v29.4s, v5.4s\n"
    "fmla v18.4s, v29.4s, v4.4s\n"
    "fmla v25.4s, v29.4s, v2.4s\n"
    "fmla v22.4s, v29.4s, v6.4s\n"
    "fmla v24.4s, v29.4s, v19.4s\n"
    "fmla v21.4s, v29.4s, v20.4s\n"
    "ldr q26, [x24, x11]\n"
    "fmla v10.4s, v23.4s, v0.4s\n"
    "ldr q28, [x23, x13]\n"
    "fmla v17.4s, v23.4s, v3.4s\n"
    "add x24, x24, #16\n"
    "fmla v9.4s, v23.4s, v1.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v18.4s, v23.4s, v5.4s\n"
    "prfm pldl1keep, [x24, x16]\n"
    "fmla v25.4s, v23.4s, v4.4s\n"
    "fmla v24.4s, v23.4s, v6.4s\n"
    "fmla v9.4s, v26.4s, v3.4s\n"
    "ldr q20, [x22, x19]\n"
    "fmla v14.4s, v28.4s, v0.4s\n"
    "fmla v13.4s, v28.4s, v1.4s\n"
    "fmla v25.4s, v26.4s, v5.4s\n"
    "ldr q26, [x20, x17]\n"
    "fmla v22.4s, v28.4s, v2.4s\n"
    "ldr q23, [x9, x11]\n"
    "fmla v15.4s, v20.4s, v0.4s\n"
    "add x9, x9, #16\n"
    "fmla v13.4s, v20.4s, v3.4s\n"
    "prfm pldl1keep, [x9, #64]\n"
    "fmla v18.4s, v20.4s, v1.4s\n"
    "prfm pldl1keep, [x9, x16]\n"
    "fmla v22.4s, v20.4s, v4.4s\n"
    "fmla v24.4s, v20.4s, v2.4s\n"
    "fmla v21.4s, v20.4s, v19.4s\n"
    "ldr q27, [x23, x19]\n"
    "fmla v17.4s, v26.4s, v0.4s\n"
    "ldr q20, [x22, x17]\n"
    "fmla v18.4s, v26.4s, v3.4s\n"
    "fmla v25.4s, v26.4s, v1.4s\n"
    "fmla v22.4s, v26.4s, v5.4s\n"
    "fmla v24.4s, v26.4s, v4.4s\n"
    "fmla v21.4s, v26.4s, v6.4s\n"
    "ldr q19, [x20, x11]\n"
    "fmla v9.4s, v23.4s, v0.4s\n"
    "ldr q28, [x23, x17]\n"
    "fmla v25.4s, v23.4s, v3.4s\n"
    "add x20, x20, #16\n"
    "fmla v24.4s, v23.4s, v5.4s\n"
    "ldr q29, [x22, x11]\n"
    "fmla v13.4s, v27.4s, v0.4s\n"
    "prfm pldl1keep, [x20, #64]\n"
    "fmla v22.4s, v27.4s, v1.4s\n"
    "add x22, x22, #16\n"
    "fmla v21.4s, v27.4s, v2.4s\n"
    "ldr q30, [x23, x11]\n"
    "fmla v18.4s, v20.4s, v0.4s\n"
    "ldr q23, [%[wbptr]]\n"
    "fmla v22.4s, v20.4s, v3.4s\n"
    "add x23, x23, #16\n"
    "fmla v24.4s, v20.4s, v1.4s\n"
    "fmla v21.4s, v20.4s, v4.4s\n"
    "fmla v25.4s, v19.4s, v0.4s\n"
    "ldr q20, [%[wbptr], #16]\n"
    "fmla v22.4s, v28.4s, v0.4s\n"
    "ldr q6, [%[wbptr], #32]\n"
    "fmla v21.4s, v19.4s, v5.4s\n"
    "movi v26.16b, #0\n"
    "fmla v24.4s, v19.4s, v3.4s\n"
    "ldr q19, [%[wbptr], #64]\n"
    "fmax v12.4s, v12.4s, v26.4s\n"
    "fmax v11.4s, v11.4s, v26.4s\n"
    "fmla v21.4s, v28.4s, v1.4s\n"
    "ldr q5, [%[wbptr], #48]\n"
    "fmla v24.4s, v29.4s, v0.4s\n"
    "ldr q4, [%[wbptr], #80]\n"
    "fmax v10.4s, v10.4s, v26.4s\n"
    "fmax v9.4s, v9.4s, v26.4s\n"
    "fmla v21.4s, v29.4s, v3.4s\n"
    "ldr q2, [%[wbptr], #112]\n"
    "fmov v27.4s, #6.0\n"
    "fmax v8.4s, v8.4s, v26.4s\n"
    "fmax v7.4s, v7.4s, v26.4s\n"
    "fmax v17.4s, v17.4s, v26.4s\n"
    "fmla v21.4s, v30.4s, v0.4s\n"
    "ldr q3, [%[wbptr], #96]\n"
    "fmin v12.4s, v12.4s, v27.4s\n"
    "ldr q1, [%[wbptr], #128]\n"
    "fmin v11.4s, v11.4s, v27.4s\n"
    "fmin v10.4s, v10.4s, v27.4s\n"
    "str q12, [%[outptr0]]\n"
    "fmin v9.4s, v9.4s, v27.4s\n"
    "str q11, [%[outptr0], %[output_col_stride1]]\n"
    "fmin v8.4s, v8.4s, v27.4s\n"
    "str q10, [%[outptr0], x27]\n"
    "fmin v7.4s, v7.4s, v27.4s\n"
    "str q9, [%[outptr0], x28]\n"
    "fmin v17.4s, v17.4s, v27.4s\n"
    "str q8, [x8]\n"
    "fmax v25.4s, v25.4s, v26.4s\n"
    "str q7, [x8, %[output_col_stride1]]\n"
    "fmax v16.4s, v16.4s, v26.4s\n"
    "str q17, [x8, x27]\n"
    "fmin v25.4s, v25.4s, v27.4s\n"
    "fmin v16.4s, v16.4s, v27.4s\n"
    "ldr q0, [%[wbptr], #144]\n"
    "str q25, [x8, x28]\n"
    "fmax v15.4s, v15.4s, v26.4s\n"
    "str q16, [x25]\n"
    "fmax v18.4s, v18.4s, v26.4s\n"
    "fmin v15.4s, v15.4s, v27.4s\n"
    "ldr q28, [%[inptr0]]\n"
    "fmin v18.4s, v18.4s, v27.4s\n"
    "ldr q25, [x24]\n"
    "str q15, [x25, %[output_col_stride1]]\n"
    "fmax v24.4s, v24.4s, v26.4s\n"
    "str q18, [x25, x27]\n"
    "fmax v14.4s, v14.4s, v26.4s\n"
    "fmin v24.4s, v24.4s, v27.4s\n"
    "ldr q18, [%[inptr0], %[input_col_stride1]]\n"
    "fmin v14.4s, v14.4s, v27.4s\n"
    "ldr q30, [x9]\n"
    "str q24, [x25, x28]\n"
    "fmax v13.4s, v13.4s, v26.4s\n"
    "str q14, [x26]\n"
    "fmax v22.4s, v22.4s, v26.4s\n"
    "fmin v13.4s, v13.4s, v27.4s\n"
    "ldr q29, [x24, %[input_col_stride1]]\n"
    "fmin v22.4s, v22.4s, v27.4s\n"
    "ldr q24, [%[inptr0], x13]\n"
    "str q13, [x26, %[output_col_stride1]]\n"
    "fmax v21.4s, v21.4s, v26.4s\n"
    "str q22, [x26, x27]\n"
    "mov v12.16b, v23.16b\n"
    "fmin v21.4s, v21.4s, v27.4s\n"
    "ldr q27, [x20]\n"
    "mov v8.16b, v23.16b\n"
    "ldr q22, [x9, %[input_col_stride1]]\n"
    "str q21, [x26, x28]\n"
    "mov v11.16b, v23.16b\n"
    "mov v16.16b, v23.16b\n"
    "add %[outptr0], %[outptr0], #16\n"
    "mov v7.16b, v23.16b\n"
    "add x8, x8, #16\n"
    "mov v10.16b, v23.16b\n"
    "add x25, x25, #16\n"
    "mov v14.16b, v23.16b\n"
    "add x26, x26, #16\n"
    "mov v15.16b, v23.16b\n"
    "mov v17.16b, v23.16b\n"
    "mov v9.16b, v23.16b\n"
    "fmla v12.4s, v28.4s, v20.4s\n"
    "fmla v8.4s, v25.4s, v20.4s\n"
    "fmla v11.4s, v18.4s, v20.4s\n"
    "fmla v16.4s, v30.4s, v20.4s\n"
    "fmla v12.4s, v25.4s, v19.4s\n"
    "fmla v8.4s, v30.4s, v19.4s\n"
    "fmla v12.4s, v18.4s, v6.4s\n"
    "fmla v8.4s, v29.4s, v6.4s\n"
    "fmla v12.4s, v30.4s, v2.4s\n"
    "fmla v12.4s, v29.4s, v4.4s\n"
    "bne 2b\n"
    "3:\n"
    "mov v13.16b, v23.16b\n"
    "ldr q21, [x24, x13]\n"
    "mov v18.16b, v23.16b\n"
    "prfm pldl1keep, [x24, x10]\n"
    "fmla v11.4s, v29.4s, v19.4s\n"
    "prfm pldl1keep, [%[inptr0], x21]\n"
    "fmla v7.4s, v29.4s, v20.4s\n"
    "ldr q25, [%[inptr0], x19]\n"
    "fmla v12.4s, v24.4s, v5.4s\n"
    "prfm pldl1keep, [x22, #64]\n"
    "fmla v11.4s, v24.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x16]\n"
    "fmla v10.4s, v24.4s, v20.4s\n"
    "ldr q24, [x22]\n"
    "fmla v8.4s, v27.4s, v2.4s\n"
    "prfm pldl1keep, [x9, x10]\n"
    "fmla v16.4s, v27.4s, v19.4s\n"
    "prfm pldl1keep, [x24, x21]\n"
    "fmla v14.4s, v27.4s, v20.4s\n"
    "ldr q26, [x20, %[input_col_stride1]]\n"
    "fmla v12.4s, v22.4s, v1.4s\n"
    "prfm pldl1keep, [%[inptr0], x7]\n"
    "fmla v8.4s, v22.4s, v4.4s\n"
    "prfm pldl1keep, [x23, #64]\n"
    "fmla v11.4s, v22.4s, v2.4s\n"
    "prfm pldl1keep, [x22, x16]\n"
    "fmla v16.4s, v22.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x10]\n"
    "fmla v7.4s, v22.4s, v19.4s\n"
    "prfm pldl1keep, [x9, x21]\n"
    "fmla v15.4s, v22.4s, v20.4s\n"
    "ldr q30, [x9, x13]\n"
    "fmla v12.4s, v21.4s, v3.4s\n"
    "prfm pldl1keep, [x24, x7]\n"
    "fmla v8.4s, v21.4s, v5.4s\n"
    "prfm pldl1keep, [%[inptr0], x12]\n"
    "fmla v11.4s, v21.4s, v4.4s\n"
    "prfm pldl1keep, [x23, x16]\n"
    "fmla v7.4s, v21.4s, v6.4s\n"
    "prfm pldl1keep, [x22, x10]\n"
    "fmla v10.4s, v21.4s, v19.4s\n"
    "prfm pldl1keep, [x20, x21]\n"
    "fmla v17.4s, v21.4s, v20.4s\n"
    "ldr q22, [x24, x19]\n"
    "fmla v11.4s, v25.4s, v5.4s\n"
    "prfm pldl1keep, [x9, x7]\n"
    "fmla v10.4s, v25.4s, v6.4s\n"
    "prfm pldl1keep, [x24, x12]\n"
    "fmla v9.4s, v25.4s, v20.4s\n"
    "ldr q21, [%[inptr0], x17]\n"
    "fmla v16.4s, v24.4s, v2.4s\n"
    "prfm pldl1keep, [x23, x10]\n"
    "fmla v14.4s, v24.4s, v19.4s\n"
    "ldr q24, [x23]\n"
    "fmla v8.4s, v26.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x21]\n"
    "fmla v16.4s, v26.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x7]\n"
    "fmla v7.4s, v26.4s, v2.4s\n"
    "prfm pldl1keep, [x9, x12]\n"
    "fmla v14.4s, v26.4s, v6.4s\n"
    "prfm pldl1keep, [x23, x21]\n"
    "fmla v15.4s, v26.4s, v19.4s\n"
    "prfm pldl1keep, [x22, x7]\n"
    "fmla v13.4s, v26.4s, v20.4s\n"
    "ldr q26, [x22, %[input_col_stride1]]\n"
    "fmla v12.4s, v30.4s, v0.4s\n"
    "prfm pldl1keep, [x20, x12]\n"
    "fmla v8.4s, v30.4s, v3.4s\n"
    "prfm pldl1keep, [x23, x7]\n"
    "fmla v11.4s, v30.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x12]\n"
    "fmla v16.4s, v30.4s, v5.4s\n"
    "prfm pldl1keep, [x23, x12]\n"
    "fmla v7.4s, v30.4s, v4.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v10.4s, v30.4s, v2.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v15.4s, v30.4s, v6.4s\n"
    "fmla v17.4s, v30.4s, v19.4s\n"
    "fmla v18.4s, v30.4s, v20.4s\n"
    "ldr q27, [x20, x13]\n"
    "fmla v11.4s, v22.4s, v3.4s\n"
    "fmla v7.4s, v22.4s, v5.4s\n"
    "fmla v10.4s, v22.4s, v4.4s\n"
    "fmla v17.4s, v22.4s, v6.4s\n"
    "fmla v9.4s, v22.4s, v19.4s\n"
    "fmla v14.4s, v24.4s, v2.4s\n"
    "mov v25.16b, v23.16b\n"
    "fmla v16.4s, v26.4s, v1.4s\n"
    "fmla v10.4s, v21.4s, v5.4s\n"
    "fmla v15.4s, v26.4s, v2.4s\n"
    "fmla v25.4s, v22.4s, v20.4s\n"
    "ldr q28, [x9, x19]\n"
    "fmla v9.4s, v21.4s, v6.4s\n"
    "ldr q29, [x24, x17]\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "fmla v13.4s, v26.4s, v19.4s\n"
    "mov v22.16b, v23.16b\n"
    "fmla v8.4s, v27.4s, v0.4s\n"
    "fmla v16.4s, v27.4s, v3.4s\n"
    "fmla v7.4s, v27.4s, v1.4s\n"
    "fmla v14.4s, v27.4s, v5.4s\n"
    "fmla v15.4s, v27.4s, v4.4s\n"
    "fmla v17.4s, v27.4s, v2.4s\n"
    "fmla v13.4s, v27.4s, v6.4s\n"
    "fmla v18.4s, v27.4s, v19.4s\n"
    "fmla v22.4s, v27.4s, v20.4s\n"
    "mov v24.16b, v23.16b\n"
    "mov v21.16b, v23.16b\n"
    "fmla v11.4s, v28.4s, v0.4s\n"
    "fmla v7.4s, v28.4s, v3.4s\n"
    "fmla v10.4s, v28.4s, v1.4s\n"
    "fmla v15.4s, v28.4s, v5.4s\n"
    "fmla v17.4s, v28.4s, v4.4s\n"
    "fmla v9.4s, v28.4s, v2.4s\n"
    "fmla v18.4s, v28.4s, v6.4s\n"
    "fmla v25.4s, v28.4s, v19.4s\n"
    "fmla v24.4s, v28.4s, v20.4s\n"
    "ldr q23, [%[inptr0], x11]\n"
    "fmla v10.4s, v29.4s, v3.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v17.4s, v29.4s, v5.4s\n"
    "fmla v9.4s, v29.4s, v4.4s\n"
    "fmla v25.4s, v29.4s, v6.4s\n"
    "ldr q30, [x23, %[input_col_stride1]]\n"
    "fmla v14.4s, v30.4s, v1.4s\n"
    "fmla v13.4s, v30.4s, v2.4s\n"
    "fmla v9.4s, v23.4s, v5.4s\n"
    "ldr q23, [x22, x13]\n"
    "fmla v16.4s, v23.4s, v0.4s\n"
    "ldr q29, [x20, x19]\n"
    "fmla v14.4s, v23.4s, v3.4s\n"
    "fmla v15.4s, v23.4s, v1.4s\n"
    "fmla v13.4s, v23.4s, v4.4s\n"
    "fmla v18.4s, v23.4s, v2.4s\n"
    "fmla v22.4s, v23.4s, v19.4s\n"
    "ldr q23, [x9, x17]\n"
    "fmla v7.4s, v29.4s, v0.4s\n"
    "fmla v15.4s, v29.4s, v3.4s\n"
    "fmla v17.4s, v29.4s, v1.4s\n"
    "fmla v13.4s, v29.4s, v5.4s\n"
    "fmla v18.4s, v29.4s, v4.4s\n"
    "fmla v25.4s, v29.4s, v2.4s\n"
    "fmla v22.4s, v29.4s, v6.4s\n"
    "fmla v24.4s, v29.4s, v19.4s\n"
    "fmla v21.4s, v29.4s, v20.4s\n"
    "ldr q26, [x24, x11]\n"
    "fmla v10.4s, v23.4s, v0.4s\n"
    "ldr q28, [x23, x13]\n"
    "fmla v17.4s, v23.4s, v3.4s\n"
    "add x24, x24, #16\n"
    "fmla v9.4s, v23.4s, v1.4s\n"
    "fmla v18.4s, v23.4s, v5.4s\n"
    "fmla v25.4s, v23.4s, v4.4s\n"
    "fmla v24.4s, v23.4s, v6.4s\n"
    "fmla v14.4s, v28.4s, v0.4s\n"
    "ldr q20, [x22, x19]\n"
    "fmla v9.4s, v26.4s, v3.4s\n"
    "fmla v13.4s, v28.4s, v1.4s\n"
    "fmla v25.4s, v26.4s, v5.4s\n"
    "ldr q26, [x20, x17]\n"
    "fmla v22.4s, v28.4s, v2.4s\n"
    "ldr q23, [x9, x11]\n"
    "fmla v15.4s, v20.4s, v0.4s\n"
    "add x9, x9, #16\n"
    "fmla v13.4s, v20.4s, v3.4s\n"
    "fmla v18.4s, v20.4s, v1.4s\n"
    "fmla v22.4s, v20.4s, v4.4s\n"
    "fmla v24.4s, v20.4s, v2.4s\n"
    "fmla v21.4s, v20.4s, v19.4s\n"
    "ldr q27, [x23, x19]\n"
    "fmla v17.4s, v26.4s, v0.4s\n"
    "ldr q20, [x22, x17]\n"
    "fmla v18.4s, v26.4s, v3.4s\n"
    "fmla v25.4s, v26.4s, v1.4s\n"
    "fmla v22.4s, v26.4s, v5.4s\n"
    "fmla v24.4s, v26.4s, v4.4s\n"
    "fmla v21.4s, v26.4s, v6.4s\n"
    "ldr q19, [x20, x11]\n"
    "fmla v9.4s, v23.4s, v0.4s\n"
    "ldr q28, [x23, x17]\n"
    "fmla v25.4s, v23.4s, v3.4s\n"
    "add x20, x20, #16\n"
    "fmla v24.4s, v23.4s, v5.4s\n"
    "ldr q29, [x22, x11]\n"
    "fmla v13.4s, v27.4s, v0.4s\n"
    "add x22, x22, #16\n"
    "fmla v22.4s, v27.4s, v1.4s\n"
    "fmla v21.4s, v27.4s, v2.4s\n"
    "fmla v18.4s, v20.4s, v0.4s\n"
    "ldr q30, [x23, x11]\n"
    "fmla v24.4s, v20.4s, v1.4s\n"
    "add x23, x23, #16\n"
    "fmla v22.4s, v20.4s, v3.4s\n"
    "fmla v21.4s, v20.4s, v4.4s\n"
    "fmla v25.4s, v19.4s, v0.4s\n"
    "movi v26.16b, #0\n"
    "fmla v24.4s, v19.4s, v3.4s\n"
    "fmov v27.4s, #6.0\n"
    "fmla v21.4s, v19.4s, v5.4s\n"
    "fmla v22.4s, v28.4s, v0.4s\n"
    "fmax v12.4s, v12.4s, v26.4s\n"
    "fmax v11.4s, v11.4s, v26.4s\n"
    "fmla v24.4s, v29.4s, v0.4s\n"
    "fmax v10.4s, v10.4s, v26.4s\n"
    "fmla v21.4s, v28.4s, v1.4s\n"
    "fmin v12.4s, v12.4s, v27.4s\n"
    "fmin v11.4s, v11.4s, v27.4s\n"
    "fmin v10.4s, v10.4s, v27.4s\n"
    "str q12, [%[outptr0]]\n"
    "fmax v9.4s, v9.4s, v26.4s\n"
    "str q11, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v21.4s, v29.4s, v3.4s\n"
    "str q10, [%[outptr0], x27]\n"
    "fmin v9.4s, v9.4s, v27.4s\n"
    "fmax v8.4s, v8.4s, v26.4s\n"
    "fmax v7.4s, v7.4s, v26.4s\n"
    "str q9, [%[outptr0], x28]\n"
    "fmla v21.4s, v30.4s, v0.4s\n"
    "fmin v8.4s, v8.4s, v27.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmin v7.4s, v7.4s, v27.4s\n"
    "fmax v17.4s, v17.4s, v26.4s\n"
    "str q8, [x8]\n"
    "fmax v25.4s, v25.4s, v26.4s\n"
    "str q7, [x8, %[output_col_stride1]]\n"
    "fmin v17.4s, v17.4s, v27.4s\n"
    "fmin v25.4s, v25.4s, v27.4s\n"
    "fmax v16.4s, v16.4s, v26.4s\n"
    "str q17, [x8, x27]\n"
    "fmax v15.4s, v15.4s, v26.4s\n"
    "str q25, [x8, x28]\n"
    "fmin v16.4s, v16.4s, v27.4s\n"
    "fmin v15.4s, v15.4s, v27.4s\n"
    "add x8, x8, #16\n"
    "str q16, [x25]\n"
    "fmax v18.4s, v18.4s, v26.4s\n"
    "str q15, [x25, %[output_col_stride1]]\n"
    "fmax v24.4s, v24.4s, v26.4s\n"
    "fmin v18.4s, v18.4s, v27.4s\n"
    "fmax v14.4s, v14.4s, v26.4s\n"
    "fmin v24.4s, v24.4s, v27.4s\n"
    "fmax v13.4s, v13.4s, v26.4s\n"
    "str q18, [x25, x27]\n"
    "fmin v14.4s, v14.4s, v27.4s\n"
    "str q24, [x25, x28]\n"
    "fmin v13.4s, v13.4s, v27.4s\n"
    "str q14, [x26]\n"
    "fmax v22.4s, v22.4s, v26.4s\n"
    "str q13, [x26, %[output_col_stride1]]\n"
    "fmax v21.4s, v21.4s, v26.4s\n"
    "fmin v22.4s, v22.4s, v27.4s\n"
    "add x25, x25, #16\n"
    "fmin v21.4s, v21.4s, v27.4s\n"
    "str q22, [x26, x27]\n"
    "str q21, [x26, x28]\n"
    "add x26, x26, #16\n"
    "4:\n"
    "cbz x14, 7f\n"
    "ldr s23, [%[wbptr]]\n"
    "mov v12.16b, v23.16b\n"
    "ldr s20, [%[wbptr], #4]\n"
    "mov v8.16b, v23.16b\n"
    "ldr s6, [%[wbptr], #8]\n"
    "mov v11.16b, v23.16b\n"
    "ldr s5, [%[wbptr], #12]\n"
    "mov v16.16b, v23.16b\n"
    "ldr s19, [%[wbptr], #16]\n"
    "mov v7.16b, v23.16b\n"
    "ldr s4, [%[wbptr], #20]\n"
    "mov v10.16b, v23.16b\n"
    "ldr s3, [%[wbptr], #24]\n"
    "mov v14.16b, v23.16b\n"
    "ldr s2, [%[wbptr], #28]\n"
    "mov v15.16b, v23.16b\n"
    "ldr s1, [%[wbptr], #32]\n"
    "mov v17.16b, v23.16b\n"
    "ldr s0, [%[wbptr], #36]\n"
    "mov v9.16b, v23.16b\n"
    "ldr s28, [%[inptr0]]\n"
    "fmla v12.4s, v28.4s, v20.4s\n"
    "ldr s25, [x24]\n"
    "fmla v8.4s, v25.4s, v20.4s\n"
    "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v11.4s, v18.4s, v20.4s\n"
    "ldr s30, [x9]\n"
    "fmla v12.4s, v25.4s, v19.4s\n"
    "ldr s29, [x24, %[input_col_stride1]]\n"
    "fmla v8.4s, v30.4s, v19.4s\n"
    "ldr s24, [%[inptr0], x13]\n"
    "fmla v16.4s, v30.4s, v20.4s\n"
    "ldr s27, [x20]\n"
    "fmla v12.4s, v18.4s, v6.4s\n"
    "ldr s22, [x9, %[input_col_stride1]]\n"
    "fmla v8.4s, v29.4s, v6.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "prfm pldl1keep, [x24, #64]\n"
    "subs x14, x14, #1\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "prfm pldl1keep, [x9, #64]\n"
    "fmla v12.4s, v30.4s, v2.4s\n"
    "prfm pldl1keep, [x24, x16]\n"
    "prfm pldl1keep, [%[inptr0], x10]\n"
    "prfm pldl1keep, [x20, #64]\n"
    "prfm pldl1keep, [x9, x16]\n"
    "fmla v12.4s, v29.4s, v4.4s\n"
    "beq 6f\n"
    "5:\n"
    "mov v13.16b, v23.16b\n"
    "ldr s21, [x24, x13]\n"
    "mov v18.16b, v23.16b\n"
    "prfm pldl1keep, [x24, x10]\n"
    "fmla v11.4s, v29.4s, v19.4s\n"
    "prfm pldl1keep, [%[inptr0], x21]\n"
    "fmla v7.4s, v29.4s, v20.4s\n"
    "ldr s25, [%[inptr0], x19]\n"
    "fmla v12.4s, v24.4s, v5.4s\n"
    "prfm pldl1keep, [x22, #64]\n"
    "fmla v11.4s, v24.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x16]\n"
    "fmla v10.4s, v24.4s, v20.4s\n"
    "ldr s24, [x22]\n"
    "fmla v8.4s, v27.4s, v2.4s\n"
    "prfm pldl1keep, [x9, x10]\n"
    "fmla v16.4s, v27.4s, v19.4s\n"
    "prfm pldl1keep, [x24, x21]\n"
    "fmla v14.4s, v27.4s, v20.4s\n"
    "ldr s26, [x20, %[input_col_stride1]]\n"
    "fmla v12.4s, v22.4s, v1.4s\n"
    "prfm pldl1keep, [%[inptr0], x7]\n"
    "fmla v8.4s, v22.4s, v4.4s\n"
    "prfm pldl1keep, [x23, #64]\n"
    "fmla v11.4s, v22.4s, v2.4s\n"
    "prfm pldl1keep, [x22, x16]\n"
    "fmla v16.4s, v22.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x10]\n"
    "fmla v7.4s, v22.4s, v19.4s\n"
    "prfm pldl1keep, [x9, x21]\n"
    "fmla v15.4s, v22.4s, v20.4s\n"
    "ldr s30, [x9, x13]\n"
    "fmla v12.4s, v21.4s, v3.4s\n"
    "prfm pldl1keep, [x24, x7]\n"
    "fmla v8.4s, v21.4s, v5.4s\n"
    "prfm pldl1keep, [%[inptr0], x12]\n"
    "fmla v11.4s, v21.4s, v4.4s\n"
    "prfm pldl1keep, [x23, x16]\n"
    "fmla v7.4s, v21.4s, v6.4s\n"
    "prfm pldl1keep, [x22, x10]\n"
    "fmla v10.4s, v21.4s, v19.4s\n"
    "prfm pldl1keep, [x20, x21]\n"
    "fmla v17.4s, v21.4s, v20.4s\n"
    "ldr s22, [x24, x19]\n"
    "fmla v11.4s, v25.4s, v5.4s\n"
    "prfm pldl1keep, [x9, x7]\n"
    "fmla v10.4s, v25.4s, v6.4s\n"
    "prfm pldl1keep, [x24, x12]\n"
    "fmla v9.4s, v25.4s, v20.4s\n"
    "ldr s21, [%[inptr0], x17]\n"
    "fmla v16.4s, v24.4s, v2.4s\n"
    "prfm pldl1keep, [x23, x10]\n"
    "fmla v14.4s, v24.4s, v19.4s\n"
    "ldr s24, [x23]\n"
    "fmla v8.4s, v26.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x21]\n"
    "fmla v16.4s, v26.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x7]\n"
    "fmla v7.4s, v26.4s, v2.4s\n"
    "prfm pldl1keep, [x9, x12]\n"
    "fmla v14.4s, v26.4s, v6.4s\n"
    "prfm pldl1keep, [x23, x21]\n"
    "fmla v15.4s, v26.4s, v19.4s\n"
    "prfm pldl1keep, [x22, x7]\n"
    "fmla v13.4s, v26.4s, v20.4s\n"
    "ldr s26, [x22, %[input_col_stride1]]\n"
    "fmla v12.4s, v30.4s, v0.4s\n"
    "prfm pldl1keep, [x20, x12]\n"
    "fmla v8.4s, v30.4s, v3.4s\n"
    "prfm pldl1keep, [x23, x7]\n"
    "fmla v11.4s, v30.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x12]\n"
    "fmla v16.4s, v30.4s, v5.4s\n"
    "prfm pldl1keep, [x23, x12]\n"
    "fmla v7.4s, v30.4s, v4.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v10.4s, v30.4s, v2.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v15.4s, v30.4s, v6.4s\n"
    "subs x14, x14, #1\n"
    "fmla v17.4s, v30.4s, v19.4s\n"
    "fmla v18.4s, v30.4s, v20.4s\n"
    "mov v25.16b, v23.16b\n"
    "fmla v11.4s, v22.4s, v3.4s\n"
    "fmla v7.4s, v22.4s, v5.4s\n"
    "fmla v10.4s, v22.4s, v4.4s\n"
    "fmla v17.4s, v22.4s, v6.4s\n"
    "fmla v9.4s, v22.4s, v19.4s\n"
    "fmla v25.4s, v22.4s, v20.4s\n"
    "ldr s27, [x20, x13]\n"
    "fmla v10.4s, v21.4s, v5.4s\n"
    "fmla v14.4s, v24.4s, v2.4s\n"
    "mov v22.16b, v23.16b\n"
    "fmla v9.4s, v21.4s, v6.4s\n"
    "mov v24.16b, v23.16b\n"
    "mov v21.16b, v23.16b\n"
    "fmla v16.4s, v26.4s, v1.4s\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "fmla v15.4s, v26.4s, v2.4s\n"
    "fmla v13.4s, v26.4s, v19.4s\n"
    "fmla v8.4s, v27.4s, v0.4s\n"
    "ldr s28, [x9, x19]\n"
    "fmla v16.4s, v27.4s, v3.4s\n"
    "fmla v7.4s, v27.4s, v1.4s\n"
    "fmla v14.4s, v27.4s, v5.4s\n"
    "fmla v15.4s, v27.4s, v4.4s\n"
    "fmla v17.4s, v27.4s, v2.4s\n"
    "fmla v13.4s, v27.4s, v6.4s\n"
    "fmla v18.4s, v27.4s, v19.4s\n"
    "fmla v22.4s, v27.4s, v20.4s\n"
    "fmla v11.4s, v28.4s, v0.4s\n"
    "ldr s29, [x24, x17]\n"
    "fmla v7.4s, v28.4s, v3.4s\n"
    "fmla v10.4s, v28.4s, v1.4s\n"
    "fmla v15.4s, v28.4s, v5.4s\n"
    "fmla v17.4s, v28.4s, v4.4s\n"
    "fmla v9.4s, v28.4s, v2.4s\n"
    "fmla v18.4s, v28.4s, v6.4s\n"
    "fmla v25.4s, v28.4s, v19.4s\n"
    "fmla v24.4s, v28.4s, v20.4s\n"
    "fmla v10.4s, v29.4s, v3.4s\n"
    "ldr s23, [%[inptr0], x11]\n"
    "fmla v17.4s, v29.4s, v5.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v9.4s, v29.4s, v4.4s\n"
    "prfm pldl1keep, [%[inptr0], #64]\n"
    "fmla v25.4s, v29.4s, v6.4s\n"
    "ldr s30, [x23, %[input_col_stride1]]\n"
    "fmla v14.4s, v30.4s, v1.4s\n"
    "prfm pldl1keep, [%[inptr0], x16]\n"
    "fmla v9.4s, v23.4s, v5.4s\n"
    "ldr s23, [x22, x13]\n"
    "fmla v13.4s, v30.4s, v2.4s\n"
    "ldr s29, [x20, x19]\n"
    "fmla v16.4s, v23.4s, v0.4s\n"
    "prfm pldl1keep, [%[inptr0], x10]\n"
    "fmla v14.4s, v23.4s, v3.4s\n"
    "fmla v15.4s, v23.4s, v1.4s\n"
    "fmla v13.4s, v23.4s, v4.4s\n"
    "fmla v18.4s, v23.4s, v2.4s\n"
    "fmla v22.4s, v23.4s, v19.4s\n"
    "ldr s23, [x9, x17]\n"
    "fmla v7.4s, v29.4s, v0.4s\n"
    "fmla v15.4s, v29.4s, v3.4s\n"
    "fmla v17.4s, v29.4s, v1.4s\n"
    "fmla v13.4s, v29.4s, v5.4s\n"
    "fmla v18.4s, v29.4s, v4.4s\n"
    "fmla v25.4s, v29.4s, v2.4s\n"
    "fmla v22.4s, v29.4s, v6.4s\n"
    "fmla v24.4s, v29.4s, v19.4s\n"
    "fmla v21.4s, v29.4s, v20.4s\n"
    "ldr s26, [x24, x11]\n"
    "fmla v10.4s, v23.4s, v0.4s\n"
    "ldr s28, [x23, x13]\n"
    "fmla v17.4s, v23.4s, v3.4s\n"
    "add x24, x24, #4\n"
    "fmla v9.4s, v23.4s, v1.4s\n"
    "prfm pldl1keep, [x24, #64]\n"
    "fmla v18.4s, v23.4s, v5.4s\n"
    "prfm pldl1keep, [x24, x16]\n"
    "fmla v25.4s, v23.4s, v4.4s\n"
    "fmla v24.4s, v23.4s, v6.4s\n"
    "fmla v9.4s, v26.4s, v3.4s\n"
    "ldr s20, [x22, x19]\n"
    "fmla v14.4s, v28.4s, v0.4s\n"
    "fmla v13.4s, v28.4s, v1.4s\n"
    "fmla v25.4s, v26.4s, v5.4s\n"
    "ldr s26, [x20, x17]\n"
    "fmla v22.4s, v28.4s, v2.4s\n"
    "ldr s23, [x9, x11]\n"
    "fmla v15.4s, v20.4s, v0.4s\n"
    "add x9, x9, #4\n"
    "fmla v13.4s, v20.4s, v3.4s\n"
    "prfm pldl1keep, [x9, #64]\n"
    "fmla v18.4s, v20.4s, v1.4s\n"
    "prfm pldl1keep, [x9, x16]\n"
    "fmla v22.4s, v20.4s, v4.4s\n"
    "fmla v24.4s, v20.4s, v2.4s\n"
    "fmla v21.4s, v20.4s, v19.4s\n"
    "ldr s27, [x23, x19]\n"
    "fmla v17.4s, v26.4s, v0.4s\n"
    "ldr s20, [x22, x17]\n"
    "fmla v18.4s, v26.4s, v3.4s\n"
    "fmla v25.4s, v26.4s, v1.4s\n"
    "fmla v22.4s, v26.4s, v5.4s\n"
    "fmla v24.4s, v26.4s, v4.4s\n"
    "fmla v21.4s, v26.4s, v6.4s\n"
    "ldr s19, [x20, x11]\n"
    "fmla v9.4s, v23.4s, v0.4s\n"
    "ldr s28, [x23, x17]\n"
    "fmla v25.4s, v23.4s, v3.4s\n"
    "add x20, x20, #4\n"
    "fmla v24.4s, v23.4s, v5.4s\n"
    "ldr s29, [x22, x11]\n"
    "fmla v13.4s, v27.4s, v0.4s\n"
    "prfm pldl1keep, [x20, #64]\n"
    "fmla v22.4s, v27.4s, v1.4s\n"
    "add x22, x22, #4\n"
    "fmla v21.4s, v27.4s, v2.4s\n"
    "ldr s30, [x23, x11]\n"
    "fmla v18.4s, v20.4s, v0.4s\n"
    "ldr s23, [%[wbptr]]\n"
    "fmla v22.4s, v20.4s, v3.4s\n"
    "add x23, x23, #4\n"
    "fmla v24.4s, v20.4s, v1.4s\n"
    "fmla v21.4s, v20.4s, v4.4s\n"
    "fmla v25.4s, v19.4s, v0.4s\n"
    "ldr s20, [%[wbptr], #4]\n"
    "fmla v22.4s, v28.4s, v0.4s\n"
    "ldr s6, [%[wbptr], #8]\n"
    "fmla v21.4s, v19.4s, v5.4s\n"
    "movi v26.16b, #0\n"
    "fmla v24.4s, v19.4s, v3.4s\n"
    "ldr s19, [%[wbptr], #16]\n"
    "fmax v12.4s, v12.4s, v26.4s\n"
    "fmax v11.4s, v11.4s, v26.4s\n"
    "fmla v21.4s, v28.4s, v1.4s\n"
    "ldr s5, [%[wbptr], #12]\n"
    "fmla v24.4s, v29.4s, v0.4s\n"
    "ldr s4, [%[wbptr], #20]\n"
    "fmax v10.4s, v10.4s, v26.4s\n"
    "fmax v9.4s, v9.4s, v26.4s\n"
    "fmla v21.4s, v29.4s, v3.4s\n"
    "ldr s2, [%[wbptr], #28]\n"
    "fmov v27.4s, #6.0\n"
    "fmax v8.4s, v8.4s, v26.4s\n"
    "fmax v7.4s, v7.4s, v26.4s\n"
    "fmax v17.4s, v17.4s, v26.4s\n"
    "fmla v21.4s, v30.4s, v0.4s\n"
    "ldr s3, [%[wbptr], #24]\n"
    "fmin v12.4s, v12.4s, v27.4s\n"
    "ldr s1, [%[wbptr], #32]\n"
    "fmin v11.4s, v11.4s, v27.4s\n"
    "fmin v10.4s, v10.4s, v27.4s\n"
    "str s12, [%[outptr0]]\n"
    "fmin v9.4s, v9.4s, v27.4s\n"
    "str s11, [%[outptr0], %[output_col_stride1]]\n"
    "fmin v8.4s, v8.4s, v27.4s\n"
    "str s10, [%[outptr0], x27]\n"
    "fmin v7.4s, v7.4s, v27.4s\n"
    "str s9, [%[outptr0], x28]\n"
    "fmin v17.4s, v17.4s, v27.4s\n"
    "str s8, [x8]\n"
    "fmax v25.4s, v25.4s, v26.4s\n"
    "str s7, [x8, %[output_col_stride1]]\n"
    "fmax v16.4s, v16.4s, v26.4s\n"
    "str s17, [x8, x27]\n"
    "fmin v25.4s, v25.4s, v27.4s\n"
    "fmin v16.4s, v16.4s, v27.4s\n"
    "ldr s0, [%[wbptr], #36]\n"
    "str s25, [x8, x28]\n"
    "fmax v15.4s, v15.4s, v26.4s\n"
    "str s16, [x25]\n"
    "fmax v18.4s, v18.4s, v26.4s\n"
    "fmin v15.4s, v15.4s, v27.4s\n"
    "ldr s28, [%[inptr0]]\n"
    "fmin v18.4s, v18.4s, v27.4s\n"
    "ldr s25, [x24]\n"
    "str s15, [x25, %[output_col_stride1]]\n"
    "fmax v24.4s, v24.4s, v26.4s\n"
    "str s18, [x25, x27]\n"
    "fmax v14.4s, v14.4s, v26.4s\n"
    "fmin v24.4s, v24.4s, v27.4s\n"
    "ldr s18, [%[inptr0], %[input_col_stride1]]\n"
    "fmin v14.4s, v14.4s, v27.4s\n"
    "ldr s30, [x9]\n"
    "str s24, [x25, x28]\n"
    "fmax v13.4s, v13.4s, v26.4s\n"
    "str s14, [x26]\n"
    "fmax v22.4s, v22.4s, v26.4s\n"
    "fmin v13.4s, v13.4s, v27.4s\n"
    "ldr s29, [x24, %[input_col_stride1]]\n"
    "fmin v22.4s, v22.4s, v27.4s\n"
    "ldr s24, [%[inptr0], x13]\n"
    "str s13, [x26, %[output_col_stride1]]\n"
    "fmax v21.4s, v21.4s, v26.4s\n"
    "str s22, [x26, x27]\n"
    "mov v12.16b, v23.16b\n"
    "fmin v21.4s, v21.4s, v27.4s\n"
    "ldr s27, [x20]\n"
    "mov v8.16b, v23.16b\n"
    "ldr s22, [x9, %[input_col_stride1]]\n"
    "str s21, [x26, x28]\n"
    "mov v11.16b, v23.16b\n"
    "mov v16.16b, v23.16b\n"
    "add %[outptr0], %[outptr0], #4\n"
    "mov v7.16b, v23.16b\n"
    "add x8, x8, #4\n"
    "mov v10.16b, v23.16b\n"
    "add x25, x25, #4\n"
    "mov v14.16b, v23.16b\n"
    "add x26, x26, #4\n"
    "mov v15.16b, v23.16b\n"
    "mov v17.16b, v23.16b\n"
    "mov v9.16b, v23.16b\n"
    "fmla v12.4s, v28.4s, v20.4s\n"
    "fmla v8.4s, v25.4s, v20.4s\n"
    "fmla v11.4s, v18.4s, v20.4s\n"
    "fmla v16.4s, v30.4s, v20.4s\n"
    "fmla v12.4s, v25.4s, v19.4s\n"
    "fmla v8.4s, v30.4s, v19.4s\n"
    "fmla v12.4s, v18.4s, v6.4s\n"
    "fmla v8.4s, v29.4s, v6.4s\n"
    "fmla v12.4s, v30.4s, v2.4s\n"
    "fmla v12.4s, v29.4s, v4.4s\n"
    "bne 5b\n"
    "6:\n"
    "mov v13.16b, v23.16b\n"
    "ldr s21, [x24, x13]\n"
    "mov v18.16b, v23.16b\n"
    "prfm pldl1keep, [x24, x10]\n"
    "fmla v11.4s, v29.4s, v19.4s\n"
    "prfm pldl1keep, [%[inptr0], x21]\n"
    "fmla v7.4s, v29.4s, v20.4s\n"
    "ldr s25, [%[inptr0], x19]\n"
    "fmla v12.4s, v24.4s, v5.4s\n"
    "prfm pldl1keep, [x22, #64]\n"
    "fmla v11.4s, v24.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x16]\n"
    "fmla v10.4s, v24.4s, v20.4s\n"
    "ldr s24, [x22]\n"
    "fmla v8.4s, v27.4s, v2.4s\n"
    "prfm pldl1keep, [x9, x10]\n"
    "fmla v16.4s, v27.4s, v19.4s\n"
    "prfm pldl1keep, [x24, x21]\n"
    "fmla v14.4s, v27.4s, v20.4s\n"
    "ldr s26, [x20, %[input_col_stride1]]\n"
    "fmla v12.4s, v22.4s, v1.4s\n"
    "prfm pldl1keep, [%[inptr0], x7]\n"
    "fmla v8.4s, v22.4s, v4.4s\n"
    "prfm pldl1keep, [x23, #64]\n"
    "fmla v11.4s, v22.4s, v2.4s\n"
    "prfm pldl1keep, [x22, x16]\n"
    "fmla v16.4s, v22.4s, v6.4s\n"
    "prfm pldl1keep, [x20, x10]\n"
    "fmla v7.4s, v22.4s, v19.4s\n"
    "prfm pldl1keep, [x9, x21]\n"
    "fmla v15.4s, v22.4s, v20.4s\n"
    "ldr s30, [x9, x13]\n"
    "fmla v12.4s, v21.4s, v3.4s\n"
    "prfm pldl1keep, [x24, x7]\n"
    "fmla v8.4s, v21.4s, v5.4s\n"
    "prfm pldl1keep, [%[inptr0], x12]\n"
    "fmla v11.4s, v21.4s, v4.4s\n"
    "prfm pldl1keep, [x23, x16]\n"
    "fmla v7.4s, v21.4s, v6.4s\n"
    "prfm pldl1keep, [x22, x10]\n"
    "fmla v10.4s, v21.4s, v19.4s\n"
    "prfm pldl1keep, [x20, x21]\n"
    "fmla v17.4s, v21.4s, v20.4s\n"
    "ldr s22, [x24, x19]\n"
    "fmla v11.4s, v25.4s, v5.4s\n"
    "prfm pldl1keep, [x9, x7]\n"
    "fmla v10.4s, v25.4s, v6.4s\n"
    "prfm pldl1keep, [x24, x12]\n"
    "fmla v9.4s, v25.4s, v20.4s\n"
    "ldr s21, [%[inptr0], x17]\n"
    "fmla v16.4s, v24.4s, v2.4s\n"
    "prfm pldl1keep, [x23, x10]\n"
    "fmla v14.4s, v24.4s, v19.4s\n"
    "ldr s24, [x23]\n"
    "fmla v8.4s, v26.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x21]\n"
    "fmla v16.4s, v26.4s, v4.4s\n"
    "prfm pldl1keep, [x20, x7]\n"
    "fmla v7.4s, v26.4s, v2.4s\n"
    "prfm pldl1keep, [x9, x12]\n"
    "fmla v14.4s, v26.4s, v6.4s\n"
    "prfm pldl1keep, [x23, x21]\n"
    "fmla v15.4s, v26.4s, v19.4s\n"
    "prfm pldl1keep, [x22, x7]\n"
    "fmla v13.4s, v26.4s, v20.4s\n"
    "ldr s26, [x22, %[input_col_stride1]]\n"
    "fmla v12.4s, v30.4s, v0.4s\n"
    "prfm pldl1keep, [x20, x12]\n"
    "fmla v8.4s, v30.4s, v3.4s\n"
    "prfm pldl1keep, [x23, x7]\n"
    "fmla v11.4s, v30.4s, v1.4s\n"
    "prfm pldl1keep, [x22, x12]\n"
    "fmla v16.4s, v30.4s, v5.4s\n"
    "prfm pldl1keep, [x23, x12]\n"
    "fmla v7.4s, v30.4s, v4.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v10.4s, v30.4s, v2.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v15.4s, v30.4s, v6.4s\n"
    "fmla v17.4s, v30.4s, v19.4s\n"
    "fmla v18.4s, v30.4s, v20.4s\n"
    "ldr s27, [x20, x13]\n"
    "fmla v11.4s, v22.4s, v3.4s\n"
    "fmla v7.4s, v22.4s, v5.4s\n"
    "fmla v10.4s, v22.4s, v4.4s\n"
    "fmla v17.4s, v22.4s, v6.4s\n"
    "fmla v9.4s, v22.4s, v19.4s\n"
    "fmla v14.4s, v24.4s, v2.4s\n"
    "mov v25.16b, v23.16b\n"
    "fmla v16.4s, v26.4s, v1.4s\n"
    "fmla v10.4s, v21.4s, v5.4s\n"
    "fmla v15.4s, v26.4s, v2.4s\n"
    "fmla v25.4s, v22.4s, v20.4s\n"
    "ldr s28, [x9, x19]\n"
    "fmla v9.4s, v21.4s, v6.4s\n"
    "ldr s29, [x24, x17]\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "fmla v13.4s, v26.4s, v19.4s\n"
    "mov v22.16b, v23.16b\n"
    "fmla v8.4s, v27.4s, v0.4s\n"
    "fmla v16.4s, v27.4s, v3.4s\n"
    "fmla v7.4s, v27.4s, v1.4s\n"
    "fmla v14.4s, v27.4s, v5.4s\n"
    "fmla v15.4s, v27.4s, v4.4s\n"
    "fmla v17.4s, v27.4s, v2.4s\n"
    "fmla v13.4s, v27.4s, v6.4s\n"
    "fmla v18.4s, v27.4s, v19.4s\n"
    "fmla v22.4s, v27.4s, v20.4s\n"
    "mov v24.16b, v23.16b\n"
    "mov v21.16b, v23.16b\n"
    "fmla v11.4s, v28.4s, v0.4s\n"
    "fmla v7.4s, v28.4s, v3.4s\n"
    "fmla v10.4s, v28.4s, v1.4s\n"
    "fmla v15.4s, v28.4s, v5.4s\n"
    "fmla v17.4s, v28.4s, v4.4s\n"
    "fmla v9.4s, v28.4s, v2.4s\n"
    "fmla v18.4s, v28.4s, v6.4s\n"
    "fmla v25.4s, v28.4s, v19.4s\n"
    "fmla v24.4s, v28.4s, v20.4s\n"
    "ldr s23, [%[inptr0], x11]\n"
    "fmla v10.4s, v29.4s, v3.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v17.4s, v29.4s, v5.4s\n"
    "fmla v9.4s, v29.4s, v4.4s\n"
    "fmla v25.4s, v29.4s, v6.4s\n"
    "ldr s30, [x23, %[input_col_stride1]]\n"
    "fmla v14.4s, v30.4s, v1.4s\n"
    "fmla v13.4s, v30.4s, v2.4s\n"
    "fmla v9.4s, v23.4s, v5.4s\n"
    "ldr s23, [x22, x13]\n"
    "fmla v16.4s, v23.4s, v0.4s\n"
    "ldr s29, [x20, x19]\n"
    "fmla v14.4s, v23.4s, v3.4s\n"
    "fmla v15.4s, v23.4s, v1.4s\n"
    "fmla v13.4s, v23.4s, v4.4s\n"
    "fmla v18.4s, v23.4s, v2.4s\n"
    "fmla v22.4s, v23.4s, v19.4s\n"
    "ldr s23, [x9, x17]\n"
    "fmla v7.4s, v29.4s, v0.4s\n"
    "fmla v15.4s, v29.4s, v3.4s\n"
    "fmla v17.4s, v29.4s, v1.4s\n"
    "fmla v13.4s, v29.4s, v5.4s\n"
    "fmla v18.4s, v29.4s, v4.4s\n"
    "fmla v25.4s, v29.4s, v2.4s\n"
    "fmla v22.4s, v29.4s, v6.4s\n"
    "fmla v24.4s, v29.4s, v19.4s\n"
    "fmla v21.4s, v29.4s, v20.4s\n"
    "ldr s26, [x24, x11]\n"
    "fmla v10.4s, v23.4s, v0.4s\n"
    "ldr s28, [x23, x13]\n"
    "fmla v17.4s, v23.4s, v3.4s\n"
    "add x24, x24, #4\n"
    "fmla v9.4s, v23.4s, v1.4s\n"
    "fmla v18.4s, v23.4s, v5.4s\n"
    "fmla v25.4s, v23.4s, v4.4s\n"
    "fmla v24.4s, v23.4s, v6.4s\n"
    "fmla v14.4s, v28.4s, v0.4s\n"
    "ldr s20, [x22, x19]\n"
    "fmla v9.4s, v26.4s, v3.4s\n"
    "fmla v13.4s, v28.4s, v1.4s\n"
    "fmla v25.4s, v26.4s, v5.4s\n"
    "ldr s26, [x20, x17]\n"
    "fmla v22.4s, v28.4s, v2.4s\n"
    "ldr s23, [x9, x11]\n"
    "fmla v15.4s, v20.4s, v0.4s\n"
    "add x9, x9, #4\n"
    "fmla v13.4s, v20.4s, v3.4s\n"
    "fmla v18.4s, v20.4s, v1.4s\n"
    "fmla v22.4s, v20.4s, v4.4s\n"
    "fmla v24.4s, v20.4s, v2.4s\n"
    "fmla v21.4s, v20.4s, v19.4s\n"
    "ldr s27, [x23, x19]\n"
    "fmla v17.4s, v26.4s, v0.4s\n"
    "ldr s20, [x22, x17]\n"
    "fmla v18.4s, v26.4s, v3.4s\n"
    "fmla v25.4s, v26.4s, v1.4s\n"
    "fmla v22.4s, v26.4s, v5.4s\n"
    "fmla v24.4s, v26.4s, v4.4s\n"
    "fmla v21.4s, v26.4s, v6.4s\n"
    "ldr s19, [x20, x11]\n"
    "fmla v9.4s, v23.4s, v0.4s\n"
    "ldr s28, [x23, x17]\n"
    "fmla v25.4s, v23.4s, v3.4s\n"
    "add x20, x20, #4\n"
    "fmla v24.4s, v23.4s, v5.4s\n"
    "ldr s29, [x22, x11]\n"
    "fmla v13.4s, v27.4s, v0.4s\n"
    "add x22, x22, #4\n"
    "fmla v22.4s, v27.4s, v1.4s\n"
    "fmla v21.4s, v27.4s, v2.4s\n"
    "fmla v18.4s, v20.4s, v0.4s\n"
    "ldr s30, [x23, x11]\n"
    "fmla v24.4s, v20.4s, v1.4s\n"
    "add x23, x23, #4\n"
    "fmla v22.4s, v20.4s, v3.4s\n"
    "fmla v21.4s, v20.4s, v4.4s\n"
    "fmla v25.4s, v19.4s, v0.4s\n"
    "movi v26.16b, #0\n"
    "fmla v24.4s, v19.4s, v3.4s\n"
    "fmov v27.4s, #6.0\n"
    "fmla v21.4s, v19.4s, v5.4s\n"
    "fmla v22.4s, v28.4s, v0.4s\n"
    "fmax v12.4s, v12.4s, v26.4s\n"
    "fmax v11.4s, v11.4s, v26.4s\n"
    "fmla v24.4s, v29.4s, v0.4s\n"
    "fmax v10.4s, v10.4s, v26.4s\n"
    "fmla v21.4s, v28.4s, v1.4s\n"
    "fmin v12.4s, v12.4s, v27.4s\n"
    "fmin v11.4s, v11.4s, v27.4s\n"
    "fmin v10.4s, v10.4s, v27.4s\n"
    "str s12, [%[outptr0]]\n"
    "fmax v9.4s, v9.4s, v26.4s\n"
    "str s11, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v21.4s, v29.4s, v3.4s\n"
    "str s10, [%[outptr0], x27]\n"
    "fmin v9.4s, v9.4s, v27.4s\n"
    "fmax v8.4s, v8.4s, v26.4s\n"
    "fmax v7.4s, v7.4s, v26.4s\n"
    "str s9, [%[outptr0], x28]\n"
    "fmla v21.4s, v30.4s, v0.4s\n"
    "fmin v8.4s, v8.4s, v27.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmin v7.4s, v7.4s, v27.4s\n"
    "fmax v17.4s, v17.4s, v26.4s\n"
    "str s8, [x8]\n"
    "fmax v25.4s, v25.4s, v26.4s\n"
    "str s7, [x8, %[output_col_stride1]]\n"
    "fmin v17.4s, v17.4s, v27.4s\n"
    "fmin v25.4s, v25.4s, v27.4s\n"
    "fmax v16.4s, v16.4s, v26.4s\n"
    "str s17, [x8, x27]\n"
    "fmax v15.4s, v15.4s, v26.4s\n"
    "str s25, [x8, x28]\n"
    "fmin v16.4s, v16.4s, v27.4s\n"
    "fmin v15.4s, v15.4s, v27.4s\n"
    "add x8, x8, #4\n"
    "str s16, [x25]\n"
    "fmax v18.4s, v18.4s, v26.4s\n"
    "str s15, [x25, %[output_col_stride1]]\n"
    "fmax v24.4s, v24.4s, v26.4s\n"
    "fmin v18.4s, v18.4s, v27.4s\n"
    "fmax v14.4s, v14.4s, v26.4s\n"
    "fmin v24.4s, v24.4s, v27.4s\n"
    "fmax v13.4s, v13.4s, v26.4s\n"
    "str s18, [x25, x27]\n"
    "fmin v14.4s, v14.4s, v27.4s\n"
    "str s24, [x25, x28]\n"
    "fmin v13.4s, v13.4s, v27.4s\n"
    "str s14, [x26]\n"
    "fmax v22.4s, v22.4s, v26.4s\n"
    "str s13, [x26, %[output_col_stride1]]\n"
    "fmax v21.4s, v21.4s, v26.4s\n"
    "fmin v22.4s, v22.4s, v27.4s\n"
    "add x25, x25, #4\n"
    "fmin v21.4s, v21.4s, v27.4s\n"
    "str s22, [x26, x27]\n"
    "str s21, [x26, x28]\n"
    "add x26, x26, #4\n"
    "7:\n"
    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float))
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v3", "v30", "v4", "v5", "v6", "v7", "v8", "v9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x7", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x8", "x9", "memory"
  );
}

#endif  // __aarch64__

// Explicit instantiation definition of the same DepthwiseConvolution
// specialisation that is aliased as `Conv` at the top of this file. It sits
// after the `#endif  // __aarch64__`, so it is compiled on every target,
// whereas the hand-written inline-assembly execute_tile specialisations above
// are only compiled when __aarch64__ is defined.
// NOTE(review): the template arguments presumably encode output tile size,
// kernel size, strides and the element types - confirm against the class
// template declaration pulled in via impl_fp32_fp32.hpp.
template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>;

}  // namespace depthwise
